path: root/arch/x86
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/.gitignore5
-rw-r--r--arch/x86/boot/Makefile171
-rw-r--r--arch/x86/boot/a20.c161
-rw-r--r--arch/x86/boot/apm.c98
-rw-r--r--arch/x86/boot/bitops.h45
-rw-r--r--arch/x86/boot/boot.h296
-rw-r--r--arch/x86/boot/cmdline.c97
-rw-r--r--arch/x86/boot/code16gcc.h15
-rw-r--r--arch/x86/boot/compressed/.gitignore1
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/Makefile_3250
-rw-r--r--arch/x86/boot/compressed/Makefile_6430
-rw-r--r--arch/x86/boot/compressed/head_32.S180
-rw-r--r--arch/x86/boot/compressed/head_64.S311
-rw-r--r--arch/x86/boot/compressed/misc_32.c379
-rw-r--r--arch/x86/boot/compressed/misc_64.c371
-rw-r--r--arch/x86/boot/compressed/relocs.c631
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.lds44
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.scr10
-rw-r--r--arch/x86/boot/copy.S101
-rw-r--r--arch/x86/boot/cpu.c69
-rw-r--r--arch/x86/boot/cpucheck.c268
-rw-r--r--arch/x86/boot/edd.c167
-rw-r--r--arch/x86/boot/header.S283
-rw-r--r--arch/x86/boot/install.sh61
-rw-r--r--arch/x86/boot/main.c161
-rw-r--r--arch/x86/boot/mca.c43
-rw-r--r--arch/x86/boot/memory.c118
-rw-r--r--arch/x86/boot/mtools.conf.in17
-rw-r--r--arch/x86/boot/pm.c174
-rw-r--r--arch/x86/boot/pmjump.S54
-rw-r--r--arch/x86/boot/printf.c307
-rw-r--r--arch/x86/boot/setup.ld54
-rw-r--r--arch/x86/boot/string.c52
-rw-r--r--arch/x86/boot/tools/.gitignore1
-rw-r--r--arch/x86/boot/tools/build.c168
-rw-r--r--arch/x86/boot/tty.c112
-rw-r--r--arch/x86/boot/version.c23
-rw-r--r--arch/x86/boot/vesa.h79
-rw-r--r--arch/x86/boot/video-bios.c125
-rw-r--r--arch/x86/boot/video-vesa.c292
-rw-r--r--arch/x86/boot/video-vga.c261
-rw-r--r--arch/x86/boot/video.c467
-rw-r--r--arch/x86/boot/video.h152
-rw-r--r--arch/x86/boot/voyager.c46
-rw-r--r--arch/x86/crypto/Makefile5
-rw-r--r--arch/x86/crypto/Makefile_3212
-rw-r--r--arch/x86/crypto/Makefile_6412
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S373
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S190
-rw-r--r--arch/x86/crypto/aes_32.c515
-rw-r--r--arch/x86/crypto/aes_64.c336
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S335
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S324
-rw-r--r--arch/x86/crypto/twofish_32.c97
-rw-r--r--arch/x86/crypto/twofish_64.c97
-rw-r--r--arch/x86/ia32/Makefile35
-rw-r--r--arch/x86/ia32/audit.c42
-rw-r--r--arch/x86/ia32/fpu32.c183
-rw-r--r--arch/x86/ia32/ia32_aout.c528
-rw-r--r--arch/x86/ia32/ia32_binfmt.c320
-rw-r--r--arch/x86/ia32/ia32_signal.c617
-rw-r--r--arch/x86/ia32/ia32entry.S736
-rw-r--r--arch/x86/ia32/ipc32.c57
-rw-r--r--arch/x86/ia32/mmap32.c79
-rw-r--r--arch/x86/ia32/ptrace32.c404
-rw-r--r--arch/x86/ia32/sys_ia32.c889
-rw-r--r--arch/x86/ia32/syscall32.c83
-rw-r--r--arch/x86/ia32/syscall32_syscall.S17
-rw-r--r--arch/x86/ia32/tls32.c163
-rw-r--r--arch/x86/ia32/vsyscall-sigreturn.S143
-rw-r--r--arch/x86/ia32/vsyscall-syscall.S69
-rw-r--r--arch/x86/ia32/vsyscall-sysenter.S95
-rw-r--r--arch/x86/ia32/vsyscall.lds80
-rw-r--r--arch/x86/kernel/.gitignore1
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/Makefile_3286
-rw-r--r--arch/x86/kernel/Makefile_6454
-rw-r--r--arch/x86/kernel/acpi/Makefile5
-rw-r--r--arch/x86/kernel/acpi/Makefile_3210
-rw-r--r--arch/x86/kernel/acpi/Makefile_647
-rw-r--r--arch/x86/kernel/acpi/boot.c1326
-rw-r--r--arch/x86/kernel/acpi/cstate.c164
-rw-r--r--arch/x86/kernel/acpi/earlyquirk_32.c84
-rw-r--r--arch/x86/kernel/acpi/processor.c75
-rw-r--r--arch/x86/kernel/acpi/sleep_32.c110
-rw-r--r--arch/x86/kernel/acpi/sleep_64.c120
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S321
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S456
-rw-r--r--arch/x86/kernel/alternative.c450
-rw-r--r--arch/x86/kernel/aperture_64.c298
-rw-r--r--arch/x86/kernel/apic_32.c1566
-rw-r--r--arch/x86/kernel/apic_64.c1253
-rw-r--r--arch/x86/kernel/apm_32.c2403
-rw-r--r--arch/x86/kernel/asm-offsets.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c147
-rw-r--r--arch/x86/kernel/asm-offsets_64.c85
-rw-r--r--arch/x86/kernel/audit_64.c81
-rw-r--r--arch/x86/kernel/bootflag.c98
-rw-r--r--arch/x86/kernel/bugs_64.c24
-rw-r--r--arch/x86/kernel/cpu/Makefile20
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c50
-rw-r--r--arch/x86/kernel/cpu/amd.c337
-rw-r--r--arch/x86/kernel/cpu/bugs.c192
-rw-r--r--arch/x86/kernel/cpu/centaur.c471
-rw-r--r--arch/x86/kernel/cpu/common.c733
-rw-r--r--arch/x86/kernel/cpu/cpu.h28
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig250
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile16
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c799
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c441
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c334
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c495
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1024
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c325
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c316
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c256
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c703
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1363
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h232
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c191
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c634
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c440
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c444
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c424
-rw-r--r--arch/x86/kernel/cpu/cyrix.c463
-rw-r--r--arch/x86/kernel/cpu/intel.c333
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c806
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c102
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c90
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h14
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c91
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c253
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c53
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c119
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c186
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c36
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile3
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c121
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c224
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c380
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c509
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c439
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c768
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h98
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c79
-rw-r--r--arch/x86/kernel/cpu/nexgen.c60
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c713
-rw-r--r--arch/x86/kernel/cpu/proc.c192
-rw-r--r--arch/x86/kernel/cpu/transmeta.c116
-rw-r--r--arch/x86/kernel/cpu/umc.c26
-rw-r--r--arch/x86/kernel/cpufreq/Kconfig108
-rw-r--r--arch/x86/kernel/cpuid.c242
-rw-r--r--arch/x86/kernel/crash_32.c137
-rw-r--r--arch/x86/kernel/crash_64.c135
-rw-r--r--arch/x86/kernel/crash_dump_32.c74
-rw-r--r--arch/x86/kernel/crash_dump_64.c47
-rw-r--r--arch/x86/kernel/doublefault_32.c70
-rw-r--r--arch/x86/kernel/e820_32.c944
-rw-r--r--arch/x86/kernel/e820_64.c725
-rw-r--r--arch/x86/kernel/early-quirks_64.c127
-rw-r--r--arch/x86/kernel/early_printk.c259
-rw-r--r--arch/x86/kernel/efi_32.c712
-rw-r--r--arch/x86/kernel/efi_stub_32.S122
-rw-r--r--arch/x86/kernel/entry_32.S1112
-rw-r--r--arch/x86/kernel/entry_64.S1172
-rw-r--r--arch/x86/kernel/genapic_64.c66
-rw-r--r--arch/x86/kernel/genapic_flat_64.c194
-rw-r--r--arch/x86/kernel/geode_32.c155
-rw-r--r--arch/x86/kernel/head64.c86
-rw-r--r--arch/x86/kernel/head_32.S578
-rw-r--r--arch/x86/kernel/head_64.S416
-rw-r--r--arch/x86/kernel/hpet_32.c553
-rw-r--r--arch/x86/kernel/hpet_64.c493
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c30
-rw-r--r--arch/x86/kernel/i387_32.c546
-rw-r--r--arch/x86/kernel/i387_64.c151
-rw-r--r--arch/x86/kernel/i8237.c72
-rw-r--r--arch/x86/kernel/i8253_32.c206
-rw-r--r--arch/x86/kernel/i8259_32.c420
-rw-r--r--arch/x86/kernel/i8259_64.c544
-rw-r--r--arch/x86/kernel/init_task_32.c46
-rw-r--r--arch/x86/kernel/init_task_64.c54
-rw-r--r--arch/x86/kernel/io_apic_32.c2847
-rw-r--r--arch/x86/kernel/io_apic_64.c2202
-rw-r--r--arch/x86/kernel/ioport_32.c153
-rw-r--r--arch/x86/kernel/ioport_64.c119
-rw-r--r--arch/x86/kernel/irq_32.c341
-rw-r--r--arch/x86/kernel/irq_64.c212
-rw-r--r--arch/x86/kernel/k8.c123
-rw-r--r--arch/x86/kernel/kprobes_32.c751
-rw-r--r--arch/x86/kernel/kprobes_64.c749
-rw-r--r--arch/x86/kernel/ldt_32.c250
-rw-r--r--arch/x86/kernel/ldt_64.c252
-rw-r--r--arch/x86/kernel/machine_kexec_32.c171
-rw-r--r--arch/x86/kernel/machine_kexec_64.c259
-rw-r--r--arch/x86/kernel/mca_32.c470
-rw-r--r--arch/x86/kernel/mce_64.c875
-rw-r--r--arch/x86/kernel/mce_amd_64.c689
-rw-r--r--arch/x86/kernel/mce_intel_64.c89
-rw-r--r--arch/x86/kernel/microcode.c850
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/module_64.c185
-rw-r--r--arch/x86/kernel/mpparse_32.c1132
-rw-r--r--arch/x86/kernel/mpparse_64.c852
-rw-r--r--arch/x86/kernel/msr.c224
-rw-r--r--arch/x86/kernel/nmi_32.c468
-rw-r--r--arch/x86/kernel/nmi_64.c483
-rw-r--r--arch/x86/kernel/numaq_32.c89
-rw-r--r--arch/x86/kernel/paravirt_32.c392
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1578
-rw-r--r--arch/x86/kernel/pci-dma_32.c177
-rw-r--r--arch/x86/kernel/pci-dma_64.c346
-rw-r--r--arch/x86/kernel/pci-gart_64.c740
-rw-r--r--arch/x86/kernel/pci-nommu_64.c97
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c44
-rw-r--r--arch/x86/kernel/pcspeaker.c20
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_32.c951
-rw-r--r--arch/x86/kernel/process_64.c903
-rw-r--r--arch/x86/kernel/ptrace_32.c723
-rw-r--r--arch/x86/kernel/ptrace_64.c627
-rw-r--r--arch/x86/kernel/quirks.c49
-rw-r--r--arch/x86/kernel/reboot_32.c413
-rw-r--r--arch/x86/kernel/reboot_64.c171
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c68
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S252
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S276
-rw-r--r--arch/x86/kernel/scx200_32.c131
-rw-r--r--arch/x86/kernel/setup64.c289
-rw-r--r--arch/x86/kernel/setup_32.c653
-rw-r--r--arch/x86/kernel/setup_64.c1117
-rw-r--r--arch/x86/kernel/sigframe_32.h21
-rw-r--r--arch/x86/kernel/signal_32.c667
-rw-r--r--arch/x86/kernel/signal_64.c495
-rw-r--r--arch/x86/kernel/smp_32.c707
-rw-r--r--arch/x86/kernel/smp_64.c523
-rw-r--r--arch/x86/kernel/smpboot_32.c1322
-rw-r--r--arch/x86/kernel/smpboot_64.c1085
-rw-r--r--arch/x86/kernel/smpcommon_32.c81
-rw-r--r--arch/x86/kernel/srat_32.c360
-rw-r--r--arch/x86/kernel/stacktrace.c54
-rw-r--r--arch/x86/kernel/summit_32.c180
-rw-r--r--arch/x86/kernel/suspend_64.c239
-rw-r--r--arch/x86/kernel/suspend_asm_64.S110
-rw-r--r--arch/x86/kernel/sys_i386_32.c265
-rw-r--r--arch/x86/kernel/sys_x86_64.c159
-rw-r--r--arch/x86/kernel/syscall_64.c26
-rw-r--r--arch/x86/kernel/syscall_table_32.S326
-rw-r--r--arch/x86/kernel/sysenter_32.c348
-rw-r--r--arch/x86/kernel/tce_64.c189
-rw-r--r--arch/x86/kernel/time_32.c236
-rw-r--r--arch/x86/kernel/time_64.c447
-rw-r--r--arch/x86/kernel/topology.c77
-rw-r--r--arch/x86/kernel/trampoline_32.S85
-rw-r--r--arch/x86/kernel/trampoline_64.S166
-rw-r--r--arch/x86/kernel/traps_32.c1250
-rw-r--r--arch/x86/kernel/traps_64.c1138
-rw-r--r--arch/x86/kernel/tsc_32.c413
-rw-r--r--arch/x86/kernel/tsc_64.c207
-rw-r--r--arch/x86/kernel/tsc_sync.c187
-rw-r--r--arch/x86/kernel/verify_cpu_64.S105
-rw-r--r--arch/x86/kernel/vm86_32.c843
-rw-r--r--arch/x86/kernel/vmi_32.c981
-rw-r--r--arch/x86/kernel/vmiclock_32.c320
-rw-r--r--arch/x86/kernel/vmlinux.lds.S5
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S213
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S235
-rw-r--r--arch/x86/kernel/vsmp_64.c49
-rw-r--r--arch/x86/kernel/vsyscall-int80_32.S53
-rw-r--r--arch/x86/kernel/vsyscall-note_32.S45
-rw-r--r--arch/x86/kernel/vsyscall-sigreturn_32.S143
-rw-r--r--arch/x86/kernel/vsyscall-sysenter_32.S122
-rw-r--r--arch/x86/kernel/vsyscall_32.S15
-rw-r--r--arch/x86/kernel/vsyscall_32.lds.S67
-rw-r--r--arch/x86/kernel/vsyscall_64.c349
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c62
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/Makefile_3211
-rw-r--r--arch/x86/lib/Makefile_6413
-rw-r--r--arch/x86/lib/bitops_32.c70
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/bitstr_64.c28
-rw-r--r--arch/x86/lib/checksum_32.S546
-rw-r--r--arch/x86/lib/clear_page_64.S59
-rw-r--r--arch/x86/lib/copy_page_64.S119
-rw-r--r--arch/x86/lib/copy_user_64.S354
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S217
-rw-r--r--arch/x86/lib/csum-copy_64.S249
-rw-r--r--arch/x86/lib/csum-partial_64.c150
-rw-r--r--arch/x86/lib/csum-wrappers_64.c135
-rw-r--r--arch/x86/lib/delay_32.c103
-rw-r--r--arch/x86/lib/delay_64.c57
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/getuser_64.S109
-rw-r--r--arch/x86/lib/io_64.c23
-rw-r--r--arch/x86/lib/iomap_copy_64.S30
-rw-r--r--arch/x86/lib/memcpy_32.c43
-rw-r--r--arch/x86/lib/memcpy_64.S131
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S133
-rw-r--r--arch/x86/lib/mmx_32.c403
-rw-r--r--arch/x86/lib/msr-on-cpu.c119
-rw-r--r--arch/x86/lib/putuser_32.S98
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/semaphore_32.S219
-rw-r--r--arch/x86/lib/string_32.c257
-rw-r--r--arch/x86/lib/strstr_32.c31
-rw-r--r--arch/x86/lib/thunk_64.S67
-rw-r--r--arch/x86/lib/usercopy_32.c882
-rw-r--r--arch/x86/lib/usercopy_64.c166
-rw-r--r--arch/x86/mach-default/Makefile5
-rw-r--r--arch/x86/mach-default/setup.c180
-rw-r--r--arch/x86/mach-es7000/Makefile6
-rw-r--r--arch/x86/mach-es7000/es7000.h114
-rw-r--r--arch/x86/mach-es7000/es7000plat.c327
-rw-r--r--arch/x86/mach-generic/Makefile8
-rw-r--r--arch/x86/mach-generic/bigsmp.c57
-rw-r--r--arch/x86/mach-generic/default.c26
-rw-r--r--arch/x86/mach-generic/es7000.c69
-rw-r--r--arch/x86/mach-generic/probe.c125
-rw-r--r--arch/x86/mach-generic/summit.c27
-rw-r--r--arch/x86/mach-visws/Makefile8
-rw-r--r--arch/x86/mach-visws/mpparse.c101
-rw-r--r--arch/x86/mach-visws/reboot.c55
-rw-r--r--arch/x86/mach-visws/setup.c183
-rw-r--r--arch/x86/mach-visws/traps.c68
-rw-r--r--arch/x86/mach-visws/visws_apic.c299
-rw-r--r--arch/x86/mach-voyager/Makefile8
-rw-r--r--arch/x86/mach-voyager/setup.c125
-rw-r--r--arch/x86/mach-voyager/voyager_basic.c331
-rw-r--r--arch/x86/mach-voyager/voyager_cat.c1180
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c1952
-rw-r--r--arch/x86/mach-voyager/voyager_thread.c134
-rw-r--r--arch/x86/math-emu/Makefile30
-rw-r--r--arch/x86/math-emu/README427
-rw-r--r--arch/x86/math-emu/control_w.h45
-rw-r--r--arch/x86/math-emu/div_Xsig.S365
-rw-r--r--arch/x86/math-emu/div_small.S47
-rw-r--r--arch/x86/math-emu/errors.c739
-rw-r--r--arch/x86/math-emu/exception.h53
-rw-r--r--arch/x86/math-emu/fpu_arith.c174
-rw-r--r--arch/x86/math-emu/fpu_asm.h32
-rw-r--r--arch/x86/math-emu/fpu_aux.c204
-rw-r--r--arch/x86/math-emu/fpu_emu.h218
-rw-r--r--arch/x86/math-emu/fpu_entry.c761
-rw-r--r--arch/x86/math-emu/fpu_etc.c143
-rw-r--r--arch/x86/math-emu/fpu_proto.h140
-rw-r--r--arch/x86/math-emu/fpu_system.h90
-rw-r--r--arch/x86/math-emu/fpu_tags.c127
-rw-r--r--arch/x86/math-emu/fpu_trig.c1845
-rw-r--r--arch/x86/math-emu/get_address.c438
-rw-r--r--arch/x86/math-emu/load_store.c272
-rw-r--r--arch/x86/math-emu/mul_Xsig.S176
-rw-r--r--arch/x86/math-emu/poly.h121
-rw-r--r--arch/x86/math-emu/poly_2xm1.c156
-rw-r--r--arch/x86/math-emu/poly_atan.c229
-rw-r--r--arch/x86/math-emu/poly_l2.c272
-rw-r--r--arch/x86/math-emu/poly_sin.c397
-rw-r--r--arch/x86/math-emu/poly_tan.c222
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S135
-rw-r--r--arch/x86/math-emu/reg_add_sub.c374
-rw-r--r--arch/x86/math-emu/reg_compare.c381
-rw-r--r--arch/x86/math-emu/reg_constant.c120
-rw-r--r--arch/x86/math-emu/reg_constant.h25
-rw-r--r--arch/x86/math-emu/reg_convert.c53
-rw-r--r--arch/x86/math-emu/reg_divide.c207
-rw-r--r--arch/x86/math-emu/reg_ld_str.c1375
-rw-r--r--arch/x86/math-emu/reg_mul.c132
-rw-r--r--arch/x86/math-emu/reg_norm.S147
-rw-r--r--arch/x86/math-emu/reg_round.S708
-rw-r--r--arch/x86/math-emu/reg_u_add.S167
-rw-r--r--arch/x86/math-emu/reg_u_div.S471
-rw-r--r--arch/x86/math-emu/reg_u_mul.S148
-rw-r--r--arch/x86/math-emu/reg_u_sub.S272
-rw-r--r--arch/x86/math-emu/round_Xsig.S141
-rw-r--r--arch/x86/math-emu/shr_Xsig.S87
-rw-r--r--arch/x86/math-emu/status_w.h67
-rw-r--r--arch/x86/math-emu/version.h12
-rw-r--r--arch/x86/math-emu/wm_shrx.S204
-rw-r--r--arch/x86/math-emu/wm_sqrt.S470
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/Makefile_3210
-rw-r--r--arch/x86/mm/Makefile_6410
-rw-r--r--arch/x86/mm/boot_ioremap_32.c100
-rw-r--r--arch/x86/mm/discontig_32.c431
-rw-r--r--arch/x86/mm/extable_32.c35
-rw-r--r--arch/x86/mm/extable_64.c34
-rw-r--r--arch/x86/mm/fault_32.c657
-rw-r--r--arch/x86/mm/fault_64.c636
-rw-r--r--arch/x86/mm/highmem_32.c113
-rw-r--r--arch/x86/mm/hugetlbpage.c391
-rw-r--r--arch/x86/mm/init_32.c858
-rw-r--r--arch/x86/mm/init_64.c750
-rw-r--r--arch/x86/mm/ioremap_32.c274
-rw-r--r--arch/x86/mm/ioremap_64.c210
-rw-r--r--arch/x86/mm/k8topology_64.c182
-rw-r--r--arch/x86/mm/mmap_32.c77
-rw-r--r--arch/x86/mm/mmap_64.c29
-rw-r--r--arch/x86/mm/numa_64.c648
-rw-r--r--arch/x86/mm/pageattr_32.c278
-rw-r--r--arch/x86/mm/pageattr_64.c249
-rw-r--r--arch/x86/mm/pgtable_32.c373
-rw-r--r--arch/x86/mm/srat_64.c566
-rw-r--r--arch/x86/oprofile/Kconfig17
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c127
-rw-r--r--arch/x86/oprofile/init.c48
-rw-r--r--arch/x86/oprofile/nmi_int.c477
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c69
-rw-r--r--arch/x86/oprofile/op_counter.h29
-rw-r--r--arch/x86/oprofile/op_model_athlon.c180
-rw-r--r--arch/x86/oprofile/op_model_p4.c722
-rw-r--r--arch/x86/oprofile/op_model_ppro.c192
-rw-r--r--arch/x86/oprofile/op_x86_model.h51
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/Makefile_3214
-rw-r--r--arch/x86/pci/Makefile_6417
-rw-r--r--arch/x86/pci/acpi.c90
-rw-r--r--arch/x86/pci/common.c480
-rw-r--r--arch/x86/pci/direct.c302
-rw-r--r--arch/x86/pci/early.c59
-rw-r--r--arch/x86/pci/fixup.c446
-rw-r--r--arch/x86/pci/i386.c315
-rw-r--r--arch/x86/pci/init.c37
-rw-r--r--arch/x86/pci/irq.c1173
-rw-r--r--arch/x86/pci/k8-bus_64.c83
-rw-r--r--arch/x86/pci/legacy.c56
-rw-r--r--arch/x86/pci/mmconfig-shared.c315
-rw-r--r--arch/x86/pci/mmconfig_32.c148
-rw-r--r--arch/x86/pci/mmconfig_64.c157
-rw-r--r--arch/x86/pci/numa.c135
-rw-r--r--arch/x86/pci/pcbios.c492
-rw-r--r--arch/x86/pci/pci.h149
-rw-r--r--arch/x86/pci/visws.c111
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c133
-rw-r--r--arch/x86/power/suspend.c172
-rw-r--r--arch/x86/power/swsusp.S78
-rw-r--r--arch/x86/vdso/.gitignore1
-rw-r--r--arch/x86/vdso/Makefile49
-rw-r--r--arch/x86/vdso/vclock_gettime.c121
-rw-r--r--arch/x86/vdso/vdso-note.S12
-rw-r--r--arch/x86/vdso/vdso-start.S2
-rw-r--r--arch/x86/vdso/vdso.S2
-rw-r--r--arch/x86/vdso/vdso.lds.S77
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c50
-rw-r--r--arch/x86/vdso/vma.c140
-rw-r--r--arch/x86/vdso/voffset.h1
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/video/Makefile1
-rw-r--r--arch/x86/video/fbdev.c32
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c1146
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c567
-rw-r--r--arch/x86/xen/mmu.h60
-rw-r--r--arch/x86/xen/multicalls.c90
-rw-r--r--arch/x86/xen/multicalls.h45
-rw-r--r--arch/x86/xen/setup.c111
-rw-r--r--arch/x86/xen/smp.c404
-rw-r--r--arch/x86/xen/time.c593
-rw-r--r--arch/x86/xen/vdso.h4
-rw-r--r--arch/x86/xen/xen-asm.S291
-rw-r--r--arch/x86/xen/xen-head.S38
-rw-r--r--arch/x86/xen/xen-ops.h71
478 files changed, 133441 insertions, 0 deletions
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
new file mode 100644
index 000000000000..18465143cfa2
--- /dev/null
+++ b/arch/x86/boot/.gitignore
@@ -0,0 +1,5 @@
1bootsect
2bzImage
3setup
4setup.bin
5setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
new file mode 100644
index 000000000000..cb1035f2b7e9
--- /dev/null
+++ b/arch/x86/boot/Makefile
@@ -0,0 +1,171 @@
1#
2# arch/x86/boot/Makefile
3#
4# This file is subject to the terms and conditions of the GNU General Public
5# License. See the file "COPYING" in the main directory of this archive
6# for more details.
7#
8# Copyright (C) 1994 by Linus Torvalds
9#
10
11# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'.
14
15ROOT_DEV := CURRENT
16
17# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup.
21
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23
24# If you want the RAM disk device, define this to be the size in blocks.
25
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed
30
31setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o
32setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
33setup-y += printf.o string.o tty.o video.o version.o voyager.o
34
35# The link order of the video-*.o modules can matter. In particular,
36# video-vga.o *must* be listed first, followed by video-vesa.o.
37# Hardware-specific drivers should follow in the order they should be
38# probed, and video-bios.o should typically be last.
39setup-y += video-vga.o
40setup-y += video-vesa.o
41setup-y += video-bios.o
42targets += $(setup-y)
43hostprogs-y := tools/build
44
45HOSTCFLAGS_build.o := $(LINUXINCLUDE)
46
47# ---------------------------------------------------------------------------
48
49# How to compile the 16-bit code. Note we always compile for -march=i386;
50# that way we can complain to the user if the CPU is insufficient.
51cflags-i386 :=
52cflags-x86_64 := -m32
53CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
54 $(cflags-$(ARCH)) \
55 -Wall -Wstrict-prototypes \
56 -march=i386 -mregparm=3 \
57 -include $(srctree)/$(src)/code16gcc.h \
58 -fno-strict-aliasing -fomit-frame-pointer \
59 $(call cc-option, -ffreestanding) \
60 $(call cc-option, -fno-toplevel-reorder,\
61 $(call cc-option, -fno-unit-at-a-time)) \
62 $(call cc-option, -fno-stack-protector) \
63 $(call cc-option, -mpreferred-stack-boundary=2)
64AFLAGS := $(CFLAGS) -D__ASSEMBLY__
65
66$(obj)/zImage: IMAGE_OFFSET := 0x1000
67$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK)
68$(obj)/bzImage: IMAGE_OFFSET := 0x100000
69$(obj)/bzImage: EXTRA_CFLAGS := -D__BIG_KERNEL__
70$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
71$(obj)/bzImage: BUILDFLAGS := -b
72
73quiet_cmd_image = BUILD $@
74cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
75 $(obj)/vmlinux.bin $(ROOT_DEV) > $@
76
77$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
78 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
79 $(call if_changed,image)
80 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
81
82$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
83 $(call if_changed,objcopy)
84
85SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
86
87LDFLAGS_setup.elf := -T
88$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
89 $(call if_changed,ld)
90
91OBJCOPYFLAGS_setup.bin := -O binary
92
93$(obj)/setup.bin: $(obj)/setup.elf FORCE
94 $(call if_changed,objcopy)
95
96$(obj)/compressed/vmlinux: FORCE
97 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
98
99# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
100FDARGS =
101# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
102FDINITRD =
103
104image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
105
106$(obj)/mtools.conf: $(src)/mtools.conf.in
107 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
108
109# This requires write access to /dev/fd0
110zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
111 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
112 syslinux /dev/fd0 ; sync
113 echo '$(image_cmdline)' | \
114 MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
115 if [ -f '$(FDINITRD)' ] ; then \
116 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
117 fi
118 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
119
120# These require being root or having syslinux 2.02 or higher installed
121fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
122 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
123 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
124 syslinux $(obj)/fdimage ; sync
125 echo '$(image_cmdline)' | \
126 MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
127 if [ -f '$(FDINITRD)' ] ; then \
128 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
129 fi
130 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
131
132fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
133 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
134 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
135 syslinux $(obj)/fdimage ; sync
136 echo '$(image_cmdline)' | \
137 MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
138 if [ -f '$(FDINITRD)' ] ; then \
139 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
140 fi
141 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
142
143isoimage: $(BOOTIMAGE)
144 -rm -rf $(obj)/isoimage
145 mkdir $(obj)/isoimage
146 for i in lib lib64 share end ; do \
147 if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
148 cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
149 break ; \
150 fi ; \
151 if [ $$i = end ] ; then exit 1 ; fi ; \
152 done
153 cp $(BOOTIMAGE) $(obj)/isoimage/linux
154 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
155 if [ -f '$(FDINITRD)' ] ; then \
156 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
157 fi
158 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
159 -no-emul-boot -boot-load-size 4 -boot-info-table \
160 $(obj)/isoimage
161 rm -rf $(obj)/isoimage
162
163zlilo: $(BOOTIMAGE)
164 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
165 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
166 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
167 cp System.map $(INSTALL_PATH)/
168 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
169
170install:
171 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
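Usage note (not part of the patch): the FDARGS and FDINITRD hooks above are meant to be set on the make command line, for example "make fdimage FDARGS='console=ttyS0,115200' FDINITRD=/boot/initrd.img" to build a bootable 1.44 MB floppy image with that command line and initrd; the arguments and initrd path here are purely illustrative, and "make isoimage" works the same way for a CD image.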
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
new file mode 100644
index 000000000000..31348d054fca
--- /dev/null
+++ b/arch/x86/boot/a20.c
@@ -0,0 +1,161 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/a20.c
13 *
14 * Enable A20 gate (return -1 on failure)
15 */
16
17#include "boot.h"
18
19#define MAX_8042_LOOPS 100000
20
21static int empty_8042(void)
22{
23 u8 status;
24 int loops = MAX_8042_LOOPS;
25
26 while (loops--) {
27 io_delay();
28
29 status = inb(0x64);
30 if (status & 1) {
31 /* Read and discard input data */
32 io_delay();
33 (void)inb(0x60);
34 } else if (!(status & 2)) {
35 /* Buffers empty, finished! */
36 return 0;
37 }
38 }
39
40 return -1;
41}
42
43/* Returns nonzero if the A20 line is enabled. The memory address
44 used as a test is the int $0x80 vector, which should be safe. */
45
46#define A20_TEST_ADDR (4*0x80)
47#define A20_TEST_SHORT 32
48#define A20_TEST_LONG 2097152 /* 2^21 */
49
50static int a20_test(int loops)
51{
52 int ok = 0;
53 int saved, ctr;
54
55 set_fs(0x0000);
56 set_gs(0xffff);
57
58 saved = ctr = rdfs32(A20_TEST_ADDR);
59
60 while (loops--) {
61 wrfs32(++ctr, A20_TEST_ADDR);
62 io_delay(); /* Serialize and make delay constant */
63 ok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr;
64 if (ok)
65 break;
66 }
67
68 wrfs32(saved, A20_TEST_ADDR);
69 return ok;
70}
71
72/* Quick test to see if A20 is already enabled */
73static int a20_test_short(void)
74{
75 return a20_test(A20_TEST_SHORT);
76}
77
78/* Longer test that actually waits for A20 to come on line; this
79 is useful when dealing with the KBC or other slow external circuitry. */
80static int a20_test_long(void)
81{
82 return a20_test(A20_TEST_LONG);
83}
84
85static void enable_a20_bios(void)
86{
87 asm volatile("pushfl; int $0x15; popfl"
88 : : "a" ((u16)0x2401));
89}
90
91static void enable_a20_kbc(void)
92{
93 empty_8042();
94
95 outb(0xd1, 0x64); /* Command write */
96 empty_8042();
97
98 outb(0xdf, 0x60); /* A20 on */
99 empty_8042();
100}
101
102static void enable_a20_fast(void)
103{
104 u8 port_a;
105
106 port_a = inb(0x92); /* Configuration port A */
107 port_a |= 0x02; /* Enable A20 */
108 port_a &= ~0x01; /* Do not reset machine */
109 outb(port_a, 0x92);
110}
111
112/*
113 * Actual routine to enable A20; return 0 on ok, -1 on failure
114 */
115
116#define A20_ENABLE_LOOPS 255 /* Number of times to try */
117
118int enable_a20(void)
119{
120 int loops = A20_ENABLE_LOOPS;
121
122#if defined(CONFIG_X86_ELAN)
123 /* Elan croaks if we try to touch the KBC */
124 enable_a20_fast();
125 while (!a20_test_long())
126 ;
127 return 0;
128#elif defined(CONFIG_X86_VOYAGER)
129 /* On Voyager, a20_test() is unsafe? */
130 enable_a20_kbc();
131 return 0;
132#else
133 while (loops--) {
134 /* First, check to see if A20 is already enabled
135 (legacy free, etc.) */
136 if (a20_test_short())
137 return 0;
138
139 /* Next, try the BIOS (INT 0x15, AX=0x2401) */
140 enable_a20_bios();
141 if (a20_test_short())
142 return 0;
143
144 /* Try enabling A20 through the keyboard controller */
145 empty_8042();
146 if (a20_test_short())
147 return 0; /* BIOS worked, but with delayed reaction */
148
149 enable_a20_kbc();
150 if (a20_test_long())
151 return 0;
152
153 /* Finally, try enabling the "fast A20 gate" */
154 enable_a20_fast();
155 if (a20_test_long())
156 return 0;
157 }
158
159 return -1;
160#endif
161}
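The test above exploits real-mode address wraparound: with A20 masked, physical address bit 20 is forced to zero, so the byte read through %gs = 0xffff at offset A20_TEST_ADDR + 0x10 aliases the byte written through %fs = 0x0000 at A20_TEST_ADDR. A minimal stand-alone C sketch of that arithmetic (hosted code for illustration only, not part of the boot sources):

#include <stdint.h>
#include <stdio.h>

#define A20_TEST_ADDR (4 * 0x80)	/* int 0x80 vector, as in a20.c */

static uint32_t linear(uint16_t seg, uint16_t off)
{
	return ((uint32_t)seg << 4) + off;	/* real-mode segment:offset */
}

int main(void)
{
	uint32_t lo = linear(0x0000, A20_TEST_ADDR);		/* 0x000200 */
	uint32_t hi = linear(0xffff, A20_TEST_ADDR + 0x10);	/* 0x100200 */

	/* With A20 disabled, bit 20 is masked off and both hit the same byte. */
	printf("fs:%#x gs:%#x gs(A20 off):%#x\n", lo, hi, hi & ~(1u << 20));
	return 0;
}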
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
new file mode 100644
index 000000000000..eab50c55a3a5
--- /dev/null
+++ b/arch/x86/boot/apm.c
@@ -0,0 +1,98 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au)
8 *
9 * This file is part of the Linux kernel, and is made available under
10 * the terms of the GNU General Public License version 2.
11 *
12 * ----------------------------------------------------------------------- */
13
14/*
15 * arch/i386/boot/apm.c
16 *
17 * Get APM BIOS information
18 */
19
20#include "boot.h"
21
22#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
23
24int query_apm_bios(void)
25{
26 u16 ax, bx, cx, dx, di;
27 u32 ebx, esi;
28 u8 err;
29
30 /* APM BIOS installation check */
31 ax = 0x5300;
32 bx = cx = 0;
33 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
34 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
35 : : "esi", "edi");
36
37 if (err)
38 return -1; /* No APM BIOS */
39
40 if (bx != 0x504d) /* "PM" signature */
41 return -1;
42
43 if (!(cx & 0x02)) /* 32 bits supported? */
44 return -1;
45
46 /* Disconnect first, just in case */
47 ax = 0x5304;
48 bx = 0;
49 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
50 : "+a" (ax), "+b" (bx)
51 : : "ecx", "edx", "esi", "edi");
52
53 /* Paranoia */
54 ebx = esi = 0;
55 cx = dx = di = 0;
56
57 /* 32-bit connect */
58 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
59 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
60 "+S" (esi), "+D" (di), "=m" (err)
61 : "a" (0x5303));
62
63 boot_params.apm_bios_info.cseg = ax;
64 boot_params.apm_bios_info.offset = ebx;
65 boot_params.apm_bios_info.cseg_16 = cx;
66 boot_params.apm_bios_info.dseg = dx;
67 boot_params.apm_bios_info.cseg_len = (u16)esi;
68 boot_params.apm_bios_info.cseg_16_len = esi >> 16;
69 boot_params.apm_bios_info.dseg_len = di;
70
71 if (err)
72 return -1;
73
74 /* Redo the installation check as the 32-bit connect;
75 some BIOSes return different flags this way... */
76
77 ax = 0x5300;
78 bx = cx = 0;
79 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
80 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
81 : : "esi", "edi");
82
83 if (err || bx != 0x504d) {
84		/* Failure with 32-bit connect, try to disconnect and ignore */
85 ax = 0x5304;
86 bx = 0;
87 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
88 : "+a" (ax), "+b" (bx)
89 : : "ecx", "edx", "esi", "edi");
90 return -1;
91 }
92
93 boot_params.apm_bios_info.version = ax;
94 boot_params.apm_bios_info.flags = cx;
95 return 0;
96}
97
98#endif
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
new file mode 100644
index 000000000000..8dcc8dc7db88
--- /dev/null
+++ b/arch/x86/boot/bitops.h
@@ -0,0 +1,45 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/bitops.h
13 *
14 * Very simple bitops for the boot code.
15 */
16
17#ifndef BOOT_BITOPS_H
18#define BOOT_BITOPS_H
19#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
20
21static inline int constant_test_bit(int nr, const void *addr)
22{
23 const u32 *p = (const u32 *)addr;
24 return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
25}
26static inline int variable_test_bit(int nr, const void *addr)
27{
28 u8 v;
29 const u32 *p = (const u32 *)addr;
30
31 asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
32 return v;
33}
34
35#define test_bit(nr,addr) \
36(__builtin_constant_p(nr) ? \
37 constant_test_bit((nr),(addr)) : \
38 variable_test_bit((nr),(addr)))
39
40static inline void set_bit(int nr, void *addr)
41{
42 asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr));
43}
44
45#endif /* BOOT_BITOPS_H */
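A short usage sketch for these helpers; the err_flags array below is illustrative, while the real callers are the CPU-checking code operating on feature bitmaps:

/* Illustrative only: exercising test_bit()/set_bit() from bitops.h. */
static u32 err_flags[2];			/* 64 flag bits */

static void bitops_example(void)
{
	set_bit(37, err_flags);			/* sets bit 5 of err_flags[1] */

	if (test_bit(37, err_flags))		/* constant nr -> constant_test_bit() */
		puts("bit 37 is set\n");
}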
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
new file mode 100644
index 000000000000..20bab9431acb
--- /dev/null
+++ b/arch/x86/boot/boot.h
@@ -0,0 +1,296 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/boot.h
13 *
14 * Header file for the real-mode kernel code
15 */
16
17#ifndef BOOT_BOOT_H
18#define BOOT_BOOT_H
19
20#ifndef __ASSEMBLY__
21
22#include <stdarg.h>
23#include <linux/types.h>
24#include <linux/edd.h>
25#include <asm/boot.h>
26#include <asm/bootparam.h>
27
28/* Useful macros */
29#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
30
31extern struct setup_header hdr;
32extern struct boot_params boot_params;
33
34/* Basic port I/O */
35static inline void outb(u8 v, u16 port)
36{
37 asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
38}
39static inline u8 inb(u16 port)
40{
41 u8 v;
42 asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
43 return v;
44}
45
46static inline void outw(u16 v, u16 port)
47{
48 asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
49}
50static inline u16 inw(u16 port)
51{
52 u16 v;
53 asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
54 return v;
55}
56
57static inline void outl(u32 v, u16 port)
58{
59 asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
60}
61static inline u32 inl(u32 port)
62{
63 u32 v;
64 asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
65 return v;
66}
67
68static inline void io_delay(void)
69{
70 const u16 DELAY_PORT = 0x80;
71 asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
72}
73
74/* These functions are used to reference data in other segments. */
75
76static inline u16 ds(void)
77{
78 u16 seg;
79 asm("movw %%ds,%0" : "=rm" (seg));
80 return seg;
81}
82
83static inline void set_fs(u16 seg)
84{
85 asm volatile("movw %0,%%fs" : : "rm" (seg));
86}
87static inline u16 fs(void)
88{
89 u16 seg;
90 asm volatile("movw %%fs,%0" : "=rm" (seg));
91 return seg;
92}
93
94static inline void set_gs(u16 seg)
95{
96 asm volatile("movw %0,%%gs" : : "rm" (seg));
97}
98static inline u16 gs(void)
99{
100 u16 seg;
101 asm volatile("movw %%gs,%0" : "=rm" (seg));
102 return seg;
103}
104
105typedef unsigned int addr_t;
106
107static inline u8 rdfs8(addr_t addr)
108{
109 u8 v;
110 asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
111 return v;
112}
113static inline u16 rdfs16(addr_t addr)
114{
115 u16 v;
116 asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
117 return v;
118}
119static inline u32 rdfs32(addr_t addr)
120{
121 u32 v;
122 asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
123 return v;
124}
125
126static inline void wrfs8(u8 v, addr_t addr)
127{
128 asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v));
129}
130static inline void wrfs16(u16 v, addr_t addr)
131{
132 asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v));
133}
134static inline void wrfs32(u32 v, addr_t addr)
135{
136 asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v));
137}
138
139static inline u8 rdgs8(addr_t addr)
140{
141 u8 v;
142 asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
143 return v;
144}
145static inline u16 rdgs16(addr_t addr)
146{
147 u16 v;
148 asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
149 return v;
150}
151static inline u32 rdgs32(addr_t addr)
152{
153 u32 v;
154 asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
155 return v;
156}
157
158static inline void wrgs8(u8 v, addr_t addr)
159{
160 asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v));
161}
162static inline void wrgs16(u16 v, addr_t addr)
163{
164 asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v));
165}
166static inline void wrgs32(u32 v, addr_t addr)
167{
168 asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v));
169}
170
171/* Note: these only return true/false, not a signed return value! */
172static inline int memcmp(const void *s1, const void *s2, size_t len)
173{
174 u8 diff;
175 asm("repe; cmpsb; setnz %0"
176 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
177 return diff;
178}
179
180static inline int memcmp_fs(const void *s1, addr_t s2, size_t len)
181{
182 u8 diff;
183 asm volatile("fs; repe; cmpsb; setnz %0"
184 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
185 return diff;
186}
187static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
188{
189 u8 diff;
190 asm volatile("gs; repe; cmpsb; setnz %0"
191 : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
192 return diff;
193}
194
195static inline int isdigit(int ch)
196{
197 return (ch >= '0') && (ch <= '9');
198}
199
200/* Heap -- available for dynamic lists. */
201#define STACK_SIZE 512 /* Minimum number of bytes for stack */
202
203extern char _end[];
204extern char *HEAP;
205extern char *heap_end;
206#define RESET_HEAP() ((void *)( HEAP = _end ))
207static inline char *__get_heap(size_t s, size_t a, size_t n)
208{
209 char *tmp;
210
211 HEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1));
212 tmp = HEAP;
213 HEAP += s*n;
214 return tmp;
215}
216#define GET_HEAP(type, n) \
217 ((type *)__get_heap(sizeof(type),__alignof__(type),(n)))
218
219static inline int heap_free(void)
220{
221 return heap_end-HEAP;
222}
223
224/* copy.S */
225
226void copy_to_fs(addr_t dst, void *src, size_t len);
227void *copy_from_fs(void *dst, addr_t src, size_t len);
228void copy_to_gs(addr_t dst, void *src, size_t len);
229void *copy_from_gs(void *dst, addr_t src, size_t len);
230void *memcpy(void *dst, void *src, size_t len);
231void *memset(void *dst, int c, size_t len);
232
233#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
234#define memset(d,c,l) __builtin_memset(d,c,l)
235
236/* a20.c */
237int enable_a20(void);
238
239/* apm.c */
240int query_apm_bios(void);
241
242/* cmdline.c */
243int cmdline_find_option(const char *option, char *buffer, int bufsize);
244
245/* cpu.c, cpucheck.c */
246int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
247int validate_cpu(void);
248
249/* edd.c */
250void query_edd(void);
251
252/* header.S */
253void __attribute__((noreturn)) die(void);
254
255/* mca.c */
256int query_mca(void);
257
258/* memory.c */
259int detect_memory(void);
260
261/* pm.c */
262void __attribute__((noreturn)) go_to_protected_mode(void);
263
264/* pmjump.S */
265void __attribute__((noreturn))
266 protected_mode_jump(u32 entrypoint, u32 bootparams);
267
268/* printf.c */
269int sprintf(char *buf, const char *fmt, ...);
270int vsprintf(char *buf, const char *fmt, va_list args);
271int printf(const char *fmt, ...);
272
273/* string.c */
274int strcmp(const char *str1, const char *str2);
275size_t strnlen(const char *s, size_t maxlen);
276unsigned int atou(const char *s);
277
278/* tty.c */
279void puts(const char *);
280void putchar(int);
281int getchar(void);
282void kbd_flush(void);
283int getchar_timeout(void);
284
285/* video.c */
286void set_video(void);
287
288/* video-vesa.c */
289void vesa_store_edid(void);
290
291/* voyager.c */
292int query_voyager(void);
293
294#endif /* __ASSEMBLY__ */
295
296#endif /* BOOT_BOOT_H */
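The HEAP/GET_HEAP helpers above form a tiny bump allocator over the memory between _end and heap_end. A hedged usage sketch follows; the structure and count are made up for illustration, while the real users are the video mode code:

/* Illustrative use of the boot-time bump allocator from boot.h. */
struct mode_entry {
	u16 mode;
	u16 x, y;
};

static void heap_example(void)
{
	struct mode_entry *modes;

	RESET_HEAP();					/* HEAP = _end */
	if (heap_free() < (int)(8 * sizeof(struct mode_entry)))
		die();					/* not enough heap: give up */

	modes = GET_HEAP(struct mode_entry, 8);		/* 8 aligned entries */
	modes[0].mode = 3;				/* scratch storage */
}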
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
new file mode 100644
index 000000000000..34bb778c4357
--- /dev/null
+++ b/arch/x86/boot/cmdline.c
@@ -0,0 +1,97 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cmdline.c
13 *
14 * Simple command-line parser for early boot.
15 */
16
17#include "boot.h"
18
19static inline int myisspace(u8 c)
20{
21 return c <= ' '; /* Close enough approximation */
22}
23
24/*
25 * Find a non-boolean option, that is, "option=argument". In accordance
26 * with standard Linux practice, if this option is repeated, this returns
27 * the last instance on the command line.
28 *
29 * Returns the length of the argument (regardless of whether it was
30 * truncated to fit in the buffer), or -1 if not found.
31 */
32int cmdline_find_option(const char *option, char *buffer, int bufsize)
33{
34 u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
35 addr_t cptr;
36 char c;
37 int len = -1;
38 const char *opptr = NULL;
39 char *bufptr = buffer;
40 enum {
41 st_wordstart, /* Start of word/after whitespace */
42 st_wordcmp, /* Comparing this word */
43 st_wordskip, /* Miscompare, skip */
44 st_bufcpy /* Copying this to buffer */
45 } state = st_wordstart;
46
47 if (!cmdline_ptr || cmdline_ptr >= 0x100000)
48 return -1; /* No command line, or inaccessible */
49
50 cptr = cmdline_ptr & 0xf;
51 set_fs(cmdline_ptr >> 4);
52
53 while (cptr < 0x10000 && (c = rdfs8(cptr++))) {
54 switch (state) {
55 case st_wordstart:
56 if (myisspace(c))
57 break;
58
59 /* else */
60 state = st_wordcmp;
61 opptr = option;
62 /* fall through */
63
64 case st_wordcmp:
65 if (c == '=' && !*opptr) {
66 len = 0;
67 bufptr = buffer;
68 state = st_bufcpy;
69 } else if (myisspace(c)) {
70 state = st_wordstart;
71 } else if (c != *opptr++) {
72 state = st_wordskip;
73 }
74 break;
75
76 case st_wordskip:
77 if (myisspace(c))
78 state = st_wordstart;
79 break;
80
81 case st_bufcpy:
82 if (myisspace(c)) {
83 state = st_wordstart;
84 } else {
85 if (len < bufsize-1)
86 *bufptr++ = c;
87 len++;
88 }
89 break;
90 }
91 }
92
93 if (bufsize)
94 *bufptr = '\0';
95
96 return len;
97}
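Callers see a simple interface on top of this: the 32-bit cmd_line_ptr is split into a real-mode segment (ptr >> 4) and offset (ptr & 0xf) so the line can be scanned through %fs one byte at a time. A usage sketch (the option name is only an example, though the video code does parse vga= this way):

/* Illustrative caller: look for "vga=<arg>" on the kernel command line. */
static void vga_option_example(void)
{
	char arg[32];
	int len = cmdline_find_option("vga", arg, sizeof arg);

	if (len > 0)			/* "vga=<arg>" found, arg copied */
		printf("vga argument: %s (len %d)\n", arg, len);
	else if (len == 0)		/* "vga=" present but empty */
		puts("empty vga= option\n");
	/* len < 0: option not on the command line */
}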
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
new file mode 100644
index 000000000000..d93e48010b61
--- /dev/null
+++ b/arch/x86/boot/code16gcc.h
@@ -0,0 +1,15 @@
1/*
2 * code16gcc.h
3 *
4 * This file is -include'd when compiling 16-bit C code.
5 * Note: this asm() needs to be emitted before gcc emits any code.
6 * Depending on gcc version, this requires -fno-unit-at-a-time or
7 * -fno-toplevel-reorder.
8 *
9 * Hopefully gcc will eventually have a real -m16 option so we can
10 * drop this hack long term.
11 */
12
13#ifndef __ASSEMBLY__
14asm(".code16gcc");
15#endif
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
new file mode 100644
index 000000000000..be0ed065249b
--- /dev/null
+++ b/arch/x86/boot/compressed/.gitignore
@@ -0,0 +1 @@
relocs
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
new file mode 100644
index 000000000000..52c1db854520
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/boot/compressed/Makefile_32
3else
4include ${srctree}/arch/x86/boot/compressed/Makefile_64
5endif
diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32
new file mode 100644
index 000000000000..22613c652d22
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile_32
@@ -0,0 +1,50 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
9EXTRA_AFLAGS := -traditional
10
11LDFLAGS_vmlinux := -T
12hostprogs-y := relocs
13
14CFLAGS := -m32 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
15 -fno-strict-aliasing -fPIC \
16 $(call cc-option,-ffreestanding) \
17 $(call cc-option,-fno-stack-protector)
18LDFLAGS := -m elf_i386
19
20$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE
21 $(call if_changed,ld)
22 @:
23
24$(obj)/vmlinux.bin: vmlinux FORCE
25 $(call if_changed,objcopy)
26
27quiet_cmd_relocs = RELOCS $@
28 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
29$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
30 $(call if_changed,relocs)
31
32vmlinux.bin.all-y := $(obj)/vmlinux.bin
33vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
34quiet_cmd_relocbin = BUILD $@
35 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
36$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
37 $(call if_changed,relocbin)
38
39ifdef CONFIG_RELOCATABLE
40$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
41 $(call if_changed,gzip)
42else
43$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
44 $(call if_changed,gzip)
45endif
46
47LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
48
49$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE
50 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64
new file mode 100644
index 000000000000..dc6b3380cc45
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile_64
@@ -0,0 +1,30 @@
1#
2# linux/arch/x86/boot/compressed/Makefile
3#
4# create a compressed vmlinux image from the original vmlinux
5#
6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o
8
9CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
10 -fno-strict-aliasing -fPIC -mcmodel=small \
11 $(call cc-option, -ffreestanding) \
12 $(call cc-option, -fno-stack-protector)
13AFLAGS := $(CFLAGS) -D__ASSEMBLY__
14LDFLAGS := -m elf_x86_64
15
16LDFLAGS_vmlinux := -T
17$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE
18 $(call if_changed,ld)
19 @:
20
21$(obj)/vmlinux.bin: vmlinux FORCE
22 $(call if_changed,objcopy)
23
24$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
25 $(call if_changed,gzip)
26
27LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
28
29$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE
30 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
new file mode 100644
index 000000000000..f35ea2237522
--- /dev/null
+++ b/arch/x86/boot/compressed/head_32.S
@@ -0,0 +1,180 @@
1/*
2 * linux/boot/head.S
3 *
4 * Copyright (C) 1991, 1992, 1993 Linus Torvalds
5 */
6
7/*
8 * head.S contains the 32-bit startup code.
9 *
10 * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
11 * the page directory will exist. The startup code will be overwritten by
12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that either access the BIOS via VM86
18 * mode.
19 */
20
21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */
24.text
25
26#include <linux/linkage.h>
27#include <asm/segment.h>
28#include <asm/page.h>
29#include <asm/boot.h>
30
31.section ".text.head","ax",@progbits
32 .globl startup_32
33
34startup_32:
35 cld
36 cli
37 movl $(__BOOT_DS),%eax
38 movl %eax,%ds
39 movl %eax,%es
40 movl %eax,%fs
41 movl %eax,%gs
42 movl %eax,%ss
43
44/* Calculate the delta between where we were compiled to run
45 * at and where we were actually loaded at. This can only be done
46 * with a short local call on x86. Nothing else will tell us what
47 * address we are running at. The reserved chunk of the real-mode
48 * data at 0x1e4 (defined as a scratch field) are used as the stack
49 * for this calculation. Only 4 bytes are needed.
50 */
51 leal (0x1e4+4)(%esi), %esp
52 call 1f
531: popl %ebp
54 subl $1b, %ebp
55
56/* %ebp contains the address we are loaded at by the boot loader and %ebx
57 * contains the address where we should move the kernel image temporarily
58 * for safe in-place decompression.
59 */
60
61#ifdef CONFIG_RELOCATABLE
62 movl %ebp, %ebx
63 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
64 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
65#else
66 movl $LOAD_PHYSICAL_ADDR, %ebx
67#endif
68
69 /* Replace the compressed data size with the uncompressed size */
70 subl input_len(%ebp), %ebx
71 movl output_len(%ebp), %eax
72 addl %eax, %ebx
73 /* Add 8 bytes for every 32K input block */
74 shrl $12, %eax
75 addl %eax, %ebx
76 /* Add 32K + 18 bytes of extra slack */
77 addl $(32768 + 18), %ebx
78 /* Align on a 4K boundary */
79 addl $4095, %ebx
80 andl $~4095, %ebx
81
82/* Copy the compressed kernel to the end of our buffer
83 * where decompression in place becomes safe.
84 */
85 pushl %esi
86 leal _end(%ebp), %esi
87 leal _end(%ebx), %edi
88 movl $(_end - startup_32), %ecx
89 std
90 rep
91 movsb
92 cld
93 popl %esi
94
95/* Compute the kernel start address.
96 */
97#ifdef CONFIG_RELOCATABLE
98 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
99 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
100#else
101 movl $LOAD_PHYSICAL_ADDR, %ebp
102#endif
103
104/*
105 * Jump to the relocated address.
106 */
107 leal relocated(%ebx), %eax
108 jmp *%eax
109.section ".text"
110relocated:
111
112/*
113 * Clear BSS
114 */
115 xorl %eax,%eax
116 leal _edata(%ebx),%edi
117 leal _end(%ebx), %ecx
118 subl %edi,%ecx
119 cld
120 rep
121 stosb
122
123/*
124 * Setup the stack for the decompressor
125 */
126 leal stack_end(%ebx), %esp
127
128/*
129 * Do the decompression, and jump to the new kernel..
130 */
131 movl output_len(%ebx), %eax
132 pushl %eax
133 pushl %ebp # output address
134 movl input_len(%ebx), %eax
135 pushl %eax # input_len
136 leal input_data(%ebx), %eax
137 pushl %eax # input_data
138 leal _end(%ebx), %eax
139 pushl %eax # end of the image as third argument
140 pushl %esi # real mode pointer as second arg
141 call decompress_kernel
142 addl $20, %esp
143 popl %ecx
144
145#ifdef CONFIG_RELOCATABLE
146/* Find the address of the relocations.
147 */
148 movl %ebp, %edi
149 addl %ecx, %edi
150
151/* Calculate the delta between where vmlinux was compiled to run
152 * and where it was actually loaded.
153 */
154 movl %ebp, %ebx
155 subl $LOAD_PHYSICAL_ADDR, %ebx
156 jz 2f /* Nothing to be done if loaded at compiled addr. */
157/*
158 * Process relocations.
159 */
160
1611: subl $4, %edi
162 movl 0(%edi), %ecx
163 testl %ecx, %ecx
164 jz 2f
165 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
166 jmp 1b
1672:
168#endif
169
170/*
171 * Jump to the decompressed kernel.
172 */
173 xorl %ebx,%ebx
174 jmp *%ebp
175
176.bss
177.balign 4
178stack:
179 .fill 4096, 1, 0
180stack_end:
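The buffer sizing near the top of startup_32 can be summarized in C. This is only a model of the arithmetic above (function and parameter names are descriptive, not from the source): the compressed image is copied up to an address high enough that the decompressor's worst-case output, bounded as the output size plus 8 bytes per 32 KiB block plus 32 KiB + 18 bytes of slack, cannot overrun data it has not yet read.

#include <stdint.h>

/* Model of the head_32.S buffer-size calculation. */
static uint32_t relocation_base(uint32_t load_target,	/* aligned kernel target */
				uint32_t input_len,	/* compressed size   */
				uint32_t output_len)	/* uncompressed size */
{
	uint32_t base = load_target - input_len;	/* subl input_len(%ebp), %ebx */

	base += output_len;				/* decompressed size        */
	base += output_len >> 12;			/* 8 bytes per 32 KiB block */
	base += 32768 + 18;				/* extra slack              */
	base = (base + 4095) & ~4095u;			/* align to 4 KiB           */
	return base;			/* compressed image is copied relative to this */
}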
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
new file mode 100644
index 000000000000..49467640751f
--- /dev/null
+++ b/arch/x86/boot/compressed/head_64.S
@@ -0,0 +1,311 @@
1/*
2 * linux/boot/head.S
3 *
4 * Copyright (C) 1991, 1992, 1993 Linus Torvalds
5 */
6
7/*
8 * head.S contains the 32-bit startup code.
9 *
10 * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
11 * the page directory will exist. The startup code will be overwritten by
12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that access the BIOS via VM86
18 * mode.
19 */
20
21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */
24.code32
25.text
26
27#include <linux/linkage.h>
28#include <asm/segment.h>
29#include <asm/pgtable.h>
30#include <asm/page.h>
31#include <asm/msr.h>
32
33.section ".text.head"
34 .code32
35 .globl startup_32
36
37startup_32:
38 cld
39 cli
40 movl $(__KERNEL_DS), %eax
41 movl %eax, %ds
42 movl %eax, %es
43 movl %eax, %ss
44
45/* Calculate the delta between where we were compiled to run
46 * at and where we were actually loaded at. This can only be done
47 * with a short local call on x86. Nothing else will tell us what
48 * address we are running at. The reserved chunk of the real-mode
49 * data at 0x1e4 (defined as a scratch field) are used as the stack
50 * for this calculation. Only 4 bytes are needed.
51 */
52 leal (0x1e4+4)(%esi), %esp
53 call 1f
541: popl %ebp
55 subl $1b, %ebp
56
57/* Set up a stack and make sure the CPU supports long mode. */
58 movl $user_stack_end, %eax
59 addl %ebp, %eax
60 movl %eax, %esp
61
62 call verify_cpu
63 testl %eax, %eax
64 jnz no_longmode
65
66/* Compute the delta between where we were compiled to run at
67 * and where the code will actually run at.
68 */
69/* %ebp contains the address we are loaded at by the boot loader and %ebx
70 * contains the address where we should move the kernel image temporarily
71 * for safe in-place decompression.
72 */
73
74#ifdef CONFIG_RELOCATABLE
75 movl %ebp, %ebx
76 addl $(LARGE_PAGE_SIZE -1), %ebx
77 andl $LARGE_PAGE_MASK, %ebx
78#else
79 movl $CONFIG_PHYSICAL_START, %ebx
80#endif
81
82 /* Replace the compressed data size with the uncompressed size */
83 subl input_len(%ebp), %ebx
84 movl output_len(%ebp), %eax
85 addl %eax, %ebx
86 /* Add 8 bytes for every 32K input block */
87 shrl $12, %eax
88 addl %eax, %ebx
89 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
90 addl $(32768 + 18 + 4095), %ebx
91 andl $~4095, %ebx
92
93/*
94 * Prepare for entering 64 bit mode
95 */
96
97 /* Load new GDT with the 64bit segments using 32bit descriptor */
98 leal gdt(%ebp), %eax
99 movl %eax, gdt+2(%ebp)
100 lgdt gdt(%ebp)
101
102 /* Enable PAE mode */
103 xorl %eax, %eax
104 orl $(1 << 5), %eax
105 movl %eax, %cr4
106
107 /*
108 * Build early 4G boot pagetable
109 */
110 /* Initialize Page tables to 0*/
111 leal pgtable(%ebx), %edi
112 xorl %eax, %eax
113 movl $((4096*6)/4), %ecx
114 rep stosl
115
116 /* Build Level 4 */
117 leal pgtable + 0(%ebx), %edi
118 leal 0x1007 (%edi), %eax
119 movl %eax, 0(%edi)
120
121 /* Build Level 3 */
122 leal pgtable + 0x1000(%ebx), %edi
123 leal 0x1007(%edi), %eax
124 movl $4, %ecx
1251: movl %eax, 0x00(%edi)
126 addl $0x00001000, %eax
127 addl $8, %edi
128 decl %ecx
129 jnz 1b
130
131 /* Build Level 2 */
132 leal pgtable + 0x2000(%ebx), %edi
133 movl $0x00000183, %eax
134 movl $2048, %ecx
1351: movl %eax, 0(%edi)
136 addl $0x00200000, %eax
137 addl $8, %edi
138 decl %ecx
139 jnz 1b
140
141 /* Enable the boot page tables */
142 leal pgtable(%ebx), %eax
143 movl %eax, %cr3
144
145 /* Enable Long mode in EFER (Extended Feature Enable Register) */
146 movl $MSR_EFER, %ecx
147 rdmsr
148 btsl $_EFER_LME, %eax
149 wrmsr
150
151 /* Setup for the jump to 64bit mode
152 *
153 * When the jump is performed we will be in long mode but
154 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
155 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use
156 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
157 * We place all of the values on our mini stack so lret can
158 * be used to perform that far jump.
159 */
160 pushl $__KERNEL_CS
161 leal startup_64(%ebp), %eax
162 pushl %eax
163
164 /* Enter paged protected Mode, activating Long Mode */
165 movl $0x80000001, %eax /* Enable Paging and Protected mode */
166 movl %eax, %cr0
167
168 /* Jump from 32bit compatibility mode into 64bit mode. */
169 lret
170
171no_longmode:
172 /* This isn't an x86-64 CPU so hang */
1731:
174 hlt
175 jmp 1b
176
177#include "../../kernel/verify_cpu_64.S"
178
179 /* Be careful here: startup_64 needs to be at a predictable
180 * address so I can export it in an ELF header. Bootloaders
181 * should look at the ELF header to find this address, as
182 * it may change in the future.
183 */
184 .code64
185 .org 0x200
186ENTRY(startup_64)
187 /* We come here either from startup_32 or directly from a
188 * 64bit bootloader. If we come here from a bootloader we depend on
189 * an identity mapped page table being provided that maps our
190 * entire text+data+bss and hopefully all of memory.
191 */
192
193 /* Setup data segments. */
194 xorl %eax, %eax
195 movl %eax, %ds
196 movl %eax, %es
197 movl %eax, %ss
198 movl %eax, %fs
199 movl %eax, %gs
200 lldt %ax
201 movl $0x20, %eax
202 ltr %ax
203
204 /* Compute the decompressed kernel start address. It is the
205 * address we were loaded at, aligned to a 2M boundary. %rbp contains the
206 * decompressed kernel start address.
207 *
208 * If it is a relocatable kernel then decompress and run the kernel
209 * from load address aligned to 2MB addr, otherwise decompress and
210 * run the kernel from CONFIG_PHYSICAL_START
211 */
212
213 /* Start with the delta to where the kernel will run at. */
214#ifdef CONFIG_RELOCATABLE
215 leaq startup_32(%rip) /* - $startup_32 */, %rbp
216 addq $(LARGE_PAGE_SIZE - 1), %rbp
217 andq $LARGE_PAGE_MASK, %rbp
218 movq %rbp, %rbx
219#else
220 movq $CONFIG_PHYSICAL_START, %rbp
221 movq %rbp, %rbx
222#endif
223
224 /* Replace the compressed data size with the uncompressed size */
225 movl input_len(%rip), %eax
226 subq %rax, %rbx
227 movl output_len(%rip), %eax
228 addq %rax, %rbx
229 /* Add 8 bytes for every 32K input block */
230 shrq $12, %rax
231 addq %rax, %rbx
232 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
233 addq $(32768 + 18 + 4095), %rbx
234 andq $~4095, %rbx
235
236/* Copy the compressed kernel to the end of our buffer
237 * where decompression in place becomes safe.
238 */
239 leaq _end(%rip), %r8
240 leaq _end(%rbx), %r9
241 movq $_end /* - $startup_32 */, %rcx
2421: subq $8, %r8
243 subq $8, %r9
244 movq 0(%r8), %rax
245 movq %rax, 0(%r9)
246 subq $8, %rcx
247 jnz 1b
248
249/*
250 * Jump to the relocated address.
251 */
252 leaq relocated(%rbx), %rax
253 jmp *%rax
254
255.section ".text"
256relocated:
257
258/*
259 * Clear BSS
260 */
261 xorq %rax, %rax
262 leaq _edata(%rbx), %rdi
263 leaq _end(%rbx), %rcx
264 subq %rdi, %rcx
265 cld
266 rep
267 stosb
268
269 /* Setup the stack */
270 leaq user_stack_end(%rip), %rsp
271
272 /* zero EFLAGS after setting rsp */
273 pushq $0
274 popfq
275
276/*
277 * Do the decompression, and jump to the new kernel..
278 */
279 pushq %rsi # Save the real mode argument
280 movq %rsi, %rdi # real mode address
281 leaq _heap(%rip), %rsi # _heap
282 leaq input_data(%rip), %rdx # input_data
283 movl input_len(%rip), %eax
284 movq %rax, %rcx # input_len
285 movq %rbp, %r8 # output
286 call decompress_kernel
287 popq %rsi
288
289
290/*
291 * Jump to the decompressed kernel.
292 */
293 jmp *%rbp
294
295 .data
296gdt:
297 .word gdt_end - gdt
298 .long gdt
299 .word 0
300 .quad 0x0000000000000000 /* NULL descriptor */
301 .quad 0x00af9a000000ffff /* __KERNEL_CS */
302 .quad 0x00cf92000000ffff /* __KERNEL_DS */
303 .quad 0x0080890000000000 /* TS descriptor */
304 .quad 0x0000000000000000 /* TS continued */
305gdt_end:
306 .bss
307/* Stack for uncompression */
308 .balign 4
309user_stack:
310 .fill 4096,4,0
311user_stack_end:
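The %ebx/%rbx arithmetic above mirrors the extra_bytes formula spelled out in misc_32.c/misc_64.c: room for the uncompressed output, plus 8 bytes of slack per 32K block, plus 32K + 18 bytes of gzip overhead, rounded to a 4K boundary. Below is a minimal host-side C sketch of the same calculation; the function name and the input_len/output_len values are hypothetical stand-ins for the symbols the real build patches in through the vmlinux_*.scr scripts.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-alone version of the slack calculation performed in
 * startup_32/startup_64: given the address the kernel will run at, compute
 * the relocation base for the compressed image so that decompressing in
 * place can never overrun data that has not been read yet. */
static uint32_t compressed_copy_base(uint32_t run_addr,
                                     uint32_t input_len,
                                     uint32_t output_len)
{
        uint32_t base = run_addr;

        base -= input_len;              /* replace the compressed size ...  */
        base += output_len;             /* ... with the uncompressed size   */
        base += output_len >> 12;       /* 8 bytes of slack per 32K block   */
        base += 32768 + 18 + 4095;      /* worst-case block + gzip overhead */
        base &= ~4095u;                 /* align down to a 4K boundary      */
        return base;
}

int main(void)
{
        /* hypothetical sizes: 2 MiB compressed, 5 MiB uncompressed */
        printf("copy base: %#x\n",
               compressed_copy_base(0x100000, 2u << 20, 5u << 20));
        return 0;
}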
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c
new file mode 100644
index 000000000000..b28505c544c9
--- /dev/null
+++ b/arch/x86/boot/compressed/misc_32.c
@@ -0,0 +1,379 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#undef CONFIG_PARAVIRT
13#include <linux/linkage.h>
14#include <linux/vmalloc.h>
15#include <linux/screen_info.h>
16#include <asm/io.h>
17#include <asm/page.h>
18#include <asm/boot.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
 27 * Getting to provably safe in-place decompression is hard.
 28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
 43 * Files are divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
 47 * 1 block occurs every 32K-1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
 78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
 84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block,
 85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
 86 * sufficient, to ensure that in the worst case the decompressed data for a
 87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
 95 * Adding the decompressor_size is necessary as it must live after all
 96 * of the data as well. Last I measured the decompressor is about 14K:
 97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two.
118 * We don't actually have a window, just
119 * a huge output buffer, so I report
120 * a 2G window size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static unsigned long free_mem_ptr;
190static unsigned long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x4000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#ifdef CONFIG_X86_NUMAQ
199void *xquad_portio;
200#endif
201
202#include "../../../../lib/inflate.c"
203
204static void *malloc(int size)
205{
206 void *p;
207
208 if (size <0) error("Malloc error");
209 if (free_mem_ptr <= 0) error("Memory error");
210
211 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
212
213 p = (void *)free_mem_ptr;
214 free_mem_ptr += size;
215
216 if (free_mem_ptr >= free_mem_end_ptr)
217 error("Out of memory");
218
219 return p;
220}
221
222static void free(void *where)
223{ /* Don't care */
224}
225
226static void gzip_mark(void **ptr)
227{
228 *ptr = (void *) free_mem_ptr;
229}
230
231static void gzip_release(void **ptr)
232{
233 free_mem_ptr = (unsigned long) *ptr;
234}
235
236static void scroll(void)
237{
238 int i;
239
240 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
241 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
242 vidmem[i] = ' ';
243}
244
245static void putstr(const char *s)
246{
247 int x,y,pos;
248 char c;
249
250 x = RM_SCREEN_INFO.orig_x;
251 y = RM_SCREEN_INFO.orig_y;
252
253 while ( ( c = *s++ ) != '\0' ) {
254 if ( c == '\n' ) {
255 x = 0;
256 if ( ++y >= lines ) {
257 scroll();
258 y--;
259 }
260 } else {
261 vidmem [ ( x + cols * y ) * 2 ] = c;
262 if ( ++x >= cols ) {
263 x = 0;
264 if ( ++y >= lines ) {
265 scroll();
266 y--;
267 }
268 }
269 }
270 }
271
272 RM_SCREEN_INFO.orig_x = x;
273 RM_SCREEN_INFO.orig_y = y;
274
275 pos = (x + cols * y) * 2; /* Update cursor position */
276 outb_p(14, vidport);
277 outb_p(0xff & (pos >> 9), vidport+1);
278 outb_p(15, vidport);
279 outb_p(0xff & (pos >> 1), vidport+1);
280}
281
282static void* memset(void* s, int c, unsigned n)
283{
284 int i;
285 char *ss = (char*)s;
286
287 for (i=0;i<n;i++) ss[i] = c;
288 return s;
289}
290
291static void* memcpy(void* dest, const void* src, unsigned n)
292{
293 int i;
294 char *d = (char *)dest, *s = (char *)src;
295
296 for (i=0;i<n;i++) d[i] = s[i];
297 return dest;
298}
299
300/* ===========================================================================
301 * Fill the input buffer. This is called only when the buffer is empty
302 * and at least one byte is really needed.
303 */
304static int fill_inbuf(void)
305{
306 error("ran out of input data");
307 return 0;
308}
309
310/* ===========================================================================
311 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
312 * (Used for the decompressed data only.)
313 */
314static void flush_window(void)
315{
316 /* With my window equal to my output buffer
317 * I only need to compute the crc here.
318 */
319 ulg c = crc; /* temporary variable */
320 unsigned n;
321 uch *in, ch;
322
323 in = window;
324 for (n = 0; n < outcnt; n++) {
325 ch = *in++;
326 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
327 }
328 crc = c;
329 bytes_out += (ulg)outcnt;
330 outcnt = 0;
331}
332
333static void error(char *x)
334{
335 putstr("\n\n");
336 putstr(x);
337 putstr("\n\n -- System halted");
338
339 while(1); /* Halt */
340}
341
342asmlinkage void decompress_kernel(void *rmode, unsigned long end,
343 uch *input_data, unsigned long input_len, uch *output)
344{
345 real_mode = rmode;
346
347 if (RM_SCREEN_INFO.orig_video_mode == 7) {
348 vidmem = (char *) 0xb0000;
349 vidport = 0x3b4;
350 } else {
351 vidmem = (char *) 0xb8000;
352 vidport = 0x3d4;
353 }
354
355 lines = RM_SCREEN_INFO.orig_video_lines;
356 cols = RM_SCREEN_INFO.orig_video_cols;
357
358 window = output; /* Output buffer (Normally at 1M) */
359 free_mem_ptr = end; /* Heap */
360 free_mem_end_ptr = end + HEAP_SIZE;
361 inbuf = input_data; /* Input buffer */
362 insize = input_len;
363 inptr = 0;
364
365 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
366 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
367 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
368 error("Destination address too large");
369#ifndef CONFIG_RELOCATABLE
370 if ((u32)output != LOAD_PHYSICAL_ADDR)
371 error("Wrong destination address");
372#endif
373
374 makecrc();
375 putstr("Uncompressing Linux... ");
376 gunzip();
377 putstr("Ok, booting the kernel.\n");
378 return;
379}
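Since the "window" here is the whole output buffer, flush_window() above only folds each output byte into the running CRC. The table it indexes is built by makecrc() in lib/inflate.c; the stand-alone sketch below is not that code, just the standard reflected CRC-32 (polynomial 0xEDB88320) used by gzip, shown for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Standard table-driven CRC-32, equivalent in spirit to makecrc() and the
 * update step in flush_window(), but independent of the kernel sources. */
static uint32_t crc_tab[256];

static void make_crc_table(void)
{
        for (uint32_t n = 0; n < 256; n++) {
                uint32_t c = n;
                for (int k = 0; k < 8; k++)
                        c = (c & 1) ? 0xedb88320u ^ (c >> 1) : c >> 1;
                crc_tab[n] = c;
        }
}

/* Same update as flush_window(): one table lookup per output byte. */
static uint32_t crc_update(uint32_t crc, const unsigned char *buf, size_t len)
{
        for (size_t i = 0; i < len; i++)
                crc = crc_tab[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);
        return crc;
}

int main(void)
{
        const char msg[] = "Uncompressing Linux... ";
        uint32_t crc;

        make_crc_table();
        /* gzip keeps the CRC pre- and post-inverted around the data. */
        crc = crc_update(0xffffffffu, (const unsigned char *)msg,
                         strlen(msg)) ^ 0xffffffffu;
        printf("crc32 = %08x\n", (unsigned)crc);
        return 0;
}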
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
new file mode 100644
index 000000000000..f932b0e89096
--- /dev/null
+++ b/arch/x86/boot/compressed/misc_64.c
@@ -0,0 +1,371 @@
1/*
2 * misc.c
3 *
4 * This is a collection of several routines from gzip-1.0.3
5 * adapted for Linux.
6 *
7 * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
8 * puts by Nick Holloway 1993, better puts by Martin Mares 1995
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */
11
12#define _LINUX_STRING_H_ 1
13#define __LINUX_BITMAP_H 1
14
15#include <linux/linkage.h>
16#include <linux/screen_info.h>
17#include <asm/io.h>
18#include <asm/page.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
 27 * Getting to provably safe in-place decompression is hard.
 28 * Worst case behaviours need to be analyzed.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
 43 * Files are divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
 47 * 1 block occurs every 32K-1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
 78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
 84 * per 32767 bytes of data is sufficient. To avoid problems internal to a block,
 85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
 86 * sufficient, to ensure that in the worst case the decompressed data for a
 87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
 95 * Adding the decompressor_size is necessary as it must live after all
 96 * of the data as well. Last I measured the decompressor is about 14K:
 97 * 10K of actual data and 4K of bss.
98 *
99 */
100
101/*
102 * gzip declarations
103 */
104
105#define OF(args) args
106#define STATIC static
107
108#undef memset
109#undef memcpy
110#define memzero(s, n) memset ((s), 0, (n))
111
112typedef unsigned char uch;
113typedef unsigned short ush;
114typedef unsigned long ulg;
115
116#define WSIZE 0x80000000 /* Window size must be at least 32k,
117 * and a power of two.
118 * We don't actually have a window, just
119 * a huge output buffer, so I report
120 * a 2G window size, as that should
121 * always be larger than our output buffer.
122 */
123
124static uch *inbuf; /* input buffer */
125static uch *window; /* Sliding window buffer, (and final output buffer) */
126
127static unsigned insize; /* valid bytes in inbuf */
128static unsigned inptr; /* index of next byte to be processed in inbuf */
129static unsigned outcnt; /* bytes in output buffer */
130
131/* gzip flag byte */
132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
133#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
134#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
135#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
136#define COMMENT 0x10 /* bit 4 set: file comment present */
137#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
138#define RESERVED 0xC0 /* bit 6,7: reserved */
139
140#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
141
142/* Diagnostic functions */
143#ifdef DEBUG
144# define Assert(cond,msg) {if(!(cond)) error(msg);}
145# define Trace(x) fprintf x
146# define Tracev(x) {if (verbose) fprintf x ;}
147# define Tracevv(x) {if (verbose>1) fprintf x ;}
148# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
149# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
150#else
151# define Assert(cond,msg)
152# define Trace(x)
153# define Tracev(x)
154# define Tracevv(x)
155# define Tracec(c,x)
156# define Tracecv(c,x)
157#endif
158
159static int fill_inbuf(void);
160static void flush_window(void);
161static void error(char *m);
162static void gzip_mark(void **);
163static void gzip_release(void **);
164
165/*
166 * This is set up by the setup-routine at boot-time
167 */
168static unsigned char *real_mode; /* Pointer to real-mode data */
169
170#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
171#ifndef STANDARD_MEMORY_BIOS_CALL
172#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
173#endif
174#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
175
176extern unsigned char input_data[];
177extern int input_len;
178
179static long bytes_out = 0;
180
181static void *malloc(int size);
182static void free(void *where);
183
184static void *memset(void *s, int c, unsigned n);
185static void *memcpy(void *dest, const void *src, unsigned n);
186
187static void putstr(const char *);
188
189static long free_mem_ptr;
190static long free_mem_end_ptr;
191
192#define HEAP_SIZE 0x7000
193
194static char *vidmem = (char *)0xb8000;
195static int vidport;
196static int lines, cols;
197
198#include "../../../../lib/inflate.c"
199
200static void *malloc(int size)
201{
202 void *p;
203
204 if (size <0) error("Malloc error");
205 if (free_mem_ptr <= 0) error("Memory error");
206
207 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
208
209 p = (void *)free_mem_ptr;
210 free_mem_ptr += size;
211
212 if (free_mem_ptr >= free_mem_end_ptr)
213 error("Out of memory");
214
215 return p;
216}
217
218static void free(void *where)
219{ /* Don't care */
220}
221
222static void gzip_mark(void **ptr)
223{
224 *ptr = (void *) free_mem_ptr;
225}
226
227static void gzip_release(void **ptr)
228{
229 free_mem_ptr = (long) *ptr;
230}
231
232static void scroll(void)
233{
234 int i;
235
236 memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
237 for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
238 vidmem[i] = ' ';
239}
240
241static void putstr(const char *s)
242{
243 int x,y,pos;
244 char c;
245
246 x = RM_SCREEN_INFO.orig_x;
247 y = RM_SCREEN_INFO.orig_y;
248
249 while ( ( c = *s++ ) != '\0' ) {
250 if ( c == '\n' ) {
251 x = 0;
252 if ( ++y >= lines ) {
253 scroll();
254 y--;
255 }
256 } else {
257 vidmem [ ( x + cols * y ) * 2 ] = c;
258 if ( ++x >= cols ) {
259 x = 0;
260 if ( ++y >= lines ) {
261 scroll();
262 y--;
263 }
264 }
265 }
266 }
267
268 RM_SCREEN_INFO.orig_x = x;
269 RM_SCREEN_INFO.orig_y = y;
270
271 pos = (x + cols * y) * 2; /* Update cursor position */
272 outb_p(14, vidport);
273 outb_p(0xff & (pos >> 9), vidport+1);
274 outb_p(15, vidport);
275 outb_p(0xff & (pos >> 1), vidport+1);
276}
277
278static void* memset(void* s, int c, unsigned n)
279{
280 int i;
281 char *ss = (char*)s;
282
283 for (i=0;i<n;i++) ss[i] = c;
284 return s;
285}
286
287static void* memcpy(void* dest, const void* src, unsigned n)
288{
289 int i;
290 char *d = (char *)dest, *s = (char *)src;
291
292 for (i=0;i<n;i++) d[i] = s[i];
293 return dest;
294}
295
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 ulg c = crc; /* temporary variable */
316 unsigned n;
317 uch *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (ulg)outcnt;
326 outcnt = 0;
327}
328
329static void error(char *x)
330{
331 putstr("\n\n");
332 putstr(x);
333 putstr("\n\n -- System halted");
334
335 while(1); /* Halt */
336}
337
338asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
339 uch *input_data, unsigned long input_len, uch *output)
340{
341 real_mode = rmode;
342
343 if (RM_SCREEN_INFO.orig_video_mode == 7) {
344 vidmem = (char *) 0xb0000;
345 vidport = 0x3b4;
346 } else {
347 vidmem = (char *) 0xb8000;
348 vidport = 0x3d4;
349 }
350
351 lines = RM_SCREEN_INFO.orig_video_lines;
352 cols = RM_SCREEN_INFO.orig_video_cols;
353
354 window = output; /* Output buffer (Normally at 1M) */
355 free_mem_ptr = heap; /* Heap */
356 free_mem_end_ptr = heap + HEAP_SIZE;
357 inbuf = input_data; /* Input buffer */
358 insize = input_len;
359 inptr = 0;
360
361 if ((ulg)output & (__KERNEL_ALIGN - 1))
362 error("Destination address not 2M aligned");
363 if ((ulg)output >= 0xffffffffffUL)
364 error("Destination address too large");
365
366 makecrc();
367 putstr(".\nDecompressing Linux...");
368 gunzip();
369 putstr("done.\nBooting the kernel.\n");
370 return;
371}
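The boot-time malloc()/free()/gzip_mark()/gzip_release() above amount to a bump allocator over the heap range handed in from head_64.S: allocation advances a pointer, free() is a no-op, and mark/release rewind the pointer wholesale. A self-contained sketch of the same idea follows; the heap size and names are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>

/* Minimal bump allocator in the style of the boot-time malloc() above. */
static unsigned char heap[0x7000];                    /* HEAP_SIZE in misc_64.c */
static unsigned char *free_ptr = heap;
static unsigned char *const free_end = heap + sizeof(heap);

static void *bump_malloc(size_t size)
{
        /* 4-byte align the allocation pointer, as the boot malloc() does */
        uintptr_t p = ((uintptr_t)free_ptr + 3) & ~(uintptr_t)3;

        if (p + size > (uintptr_t)free_end)
                return NULL;                  /* the boot code calls error() */
        free_ptr = (unsigned char *)(p + size);
        return (void *)p;
}

static void mark(unsigned char **m)   { *m = free_ptr; }   /* gzip_mark()    */
static void release(unsigned char *m) { free_ptr = m; }    /* gzip_release() */

int main(void)
{
        unsigned char *m;
        void *a, *b;

        mark(&m);
        a = bump_malloc(100);
        b = bump_malloc(200);
        printf("a=%p b=%p\n", a, b);
        release(m);             /* everything allocated since mark() is gone */
        return 0;
}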
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
new file mode 100644
index 000000000000..2d77ee728f92
--- /dev/null
+++ b/arch/x86/boot/compressed/relocs.c
@@ -0,0 +1,631 @@
1#include <stdio.h>
2#include <stdarg.h>
3#include <stdlib.h>
4#include <stdint.h>
5#include <string.h>
6#include <errno.h>
7#include <unistd.h>
8#include <elf.h>
9#include <byteswap.h>
10#define USE_BSD
11#include <endian.h>
12
13#define MAX_SHDRS 100
14#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
15static Elf32_Ehdr ehdr;
16static Elf32_Shdr shdr[MAX_SHDRS];
17static Elf32_Sym *symtab[MAX_SHDRS];
18static Elf32_Rel *reltab[MAX_SHDRS];
19static char *strtab[MAX_SHDRS];
20static unsigned long reloc_count, reloc_idx;
21static unsigned long *relocs;
22
23/*
 24 * The following symbols have been audited. Their values are constant and do
 25 * not change if bzImage is loaded at a different physical address than
 26 * the address for which it has been compiled. Don't warn the user about
 27 * absolute relocations present w.r.t. these symbols.
28 */
29static const char* safe_abs_relocs[] = {
30 "__kernel_vsyscall",
31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn",
33 "SYSENTER_RETURN",
34 "VDSO_NOTE_MASK",
35 "xen_irq_disable_direct_reloc",
36 "xen_save_fl_direct_reloc",
37};
38
39static int is_safe_abs_reloc(const char* sym_name)
40{
41 int i, array_size;
42
43 array_size = sizeof(safe_abs_relocs)/sizeof(char*);
44
45 for(i = 0; i < array_size; i++) {
46 if (!strcmp(sym_name, safe_abs_relocs[i]))
47 /* Match found */
48 return 1;
49 }
50 if (strncmp(sym_name, "__crc_", 6) == 0)
51 return 1;
52 return 0;
53}
54
55static void die(char *fmt, ...)
56{
57 va_list ap;
58 va_start(ap, fmt);
59 vfprintf(stderr, fmt, ap);
60 va_end(ap);
61 exit(1);
62}
63
64static const char *sym_type(unsigned type)
65{
66 static const char *type_name[] = {
67#define SYM_TYPE(X) [X] = #X
68 SYM_TYPE(STT_NOTYPE),
69 SYM_TYPE(STT_OBJECT),
70 SYM_TYPE(STT_FUNC),
71 SYM_TYPE(STT_SECTION),
72 SYM_TYPE(STT_FILE),
73 SYM_TYPE(STT_COMMON),
74 SYM_TYPE(STT_TLS),
75#undef SYM_TYPE
76 };
77 const char *name = "unknown sym type name";
78 if (type < ARRAY_SIZE(type_name)) {
79 name = type_name[type];
80 }
81 return name;
82}
83
84static const char *sym_bind(unsigned bind)
85{
86 static const char *bind_name[] = {
87#define SYM_BIND(X) [X] = #X
88 SYM_BIND(STB_LOCAL),
89 SYM_BIND(STB_GLOBAL),
90 SYM_BIND(STB_WEAK),
91#undef SYM_BIND
92 };
93 const char *name = "unknown sym bind name";
94 if (bind < ARRAY_SIZE(bind_name)) {
95 name = bind_name[bind];
96 }
97 return name;
98}
99
100static const char *sym_visibility(unsigned visibility)
101{
102 static const char *visibility_name[] = {
103#define SYM_VISIBILITY(X) [X] = #X
104 SYM_VISIBILITY(STV_DEFAULT),
105 SYM_VISIBILITY(STV_INTERNAL),
106 SYM_VISIBILITY(STV_HIDDEN),
107 SYM_VISIBILITY(STV_PROTECTED),
108#undef SYM_VISIBILITY
109 };
110 const char *name = "unknown sym visibility name";
111 if (visibility < ARRAY_SIZE(visibility_name)) {
112 name = visibility_name[visibility];
113 }
114 return name;
115}
116
117static const char *rel_type(unsigned type)
118{
119 static const char *type_name[] = {
120#define REL_TYPE(X) [X] = #X
121 REL_TYPE(R_386_NONE),
122 REL_TYPE(R_386_32),
123 REL_TYPE(R_386_PC32),
124 REL_TYPE(R_386_GOT32),
125 REL_TYPE(R_386_PLT32),
126 REL_TYPE(R_386_COPY),
127 REL_TYPE(R_386_GLOB_DAT),
128 REL_TYPE(R_386_JMP_SLOT),
129 REL_TYPE(R_386_RELATIVE),
130 REL_TYPE(R_386_GOTOFF),
131 REL_TYPE(R_386_GOTPC),
132#undef REL_TYPE
133 };
134	const char *name = "unknown rel type name";
135 if (type < ARRAY_SIZE(type_name)) {
136 name = type_name[type];
137 }
138 return name;
139}
140
141static const char *sec_name(unsigned shndx)
142{
143 const char *sec_strtab;
144 const char *name;
145 sec_strtab = strtab[ehdr.e_shstrndx];
146 name = "<noname>";
147 if (shndx < ehdr.e_shnum) {
148 name = sec_strtab + shdr[shndx].sh_name;
149 }
150 else if (shndx == SHN_ABS) {
151 name = "ABSOLUTE";
152 }
153 else if (shndx == SHN_COMMON) {
154 name = "COMMON";
155 }
156 return name;
157}
158
159static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
160{
161 const char *name;
162 name = "<noname>";
163 if (sym->st_name) {
164 name = sym_strtab + sym->st_name;
165 }
166 else {
167		name = sec_name(sym->st_shndx);
168 }
169 return name;
170}
171
172
173
174#if BYTE_ORDER == LITTLE_ENDIAN
175#define le16_to_cpu(val) (val)
176#define le32_to_cpu(val) (val)
177#endif
178#if BYTE_ORDER == BIG_ENDIAN
179#define le16_to_cpu(val) bswap_16(val)
180#define le32_to_cpu(val) bswap_32(val)
181#endif
182
183static uint16_t elf16_to_cpu(uint16_t val)
184{
185 return le16_to_cpu(val);
186}
187
188static uint32_t elf32_to_cpu(uint32_t val)
189{
190 return le32_to_cpu(val);
191}
192
193static void read_ehdr(FILE *fp)
194{
195 if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
196 die("Cannot read ELF header: %s\n",
197 strerror(errno));
198 }
199 if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
200 die("No ELF magic\n");
201 }
202 if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
203 die("Not a 32 bit executable\n");
204 }
205 if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
206 die("Not a LSB ELF executable\n");
207 }
208 if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
209 die("Unknown ELF version\n");
210 }
211 /* Convert the fields to native endian */
212 ehdr.e_type = elf16_to_cpu(ehdr.e_type);
213 ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
214 ehdr.e_version = elf32_to_cpu(ehdr.e_version);
215 ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
216 ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
217 ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
218 ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
219 ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
220 ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
221 ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
222 ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
223 ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
224 ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
225
226 if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
227 die("Unsupported ELF header type\n");
228 }
229 if (ehdr.e_machine != EM_386) {
230 die("Not for x86\n");
231 }
232 if (ehdr.e_version != EV_CURRENT) {
233 die("Unknown ELF version\n");
234 }
235 if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
236 die("Bad Elf header size\n");
237 }
238 if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
239 die("Bad program header entry\n");
240 }
241 if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
242 die("Bad section header entry\n");
243 }
244 if (ehdr.e_shstrndx >= ehdr.e_shnum) {
245 die("String table index out of bounds\n");
246 }
247}
248
249static void read_shdrs(FILE *fp)
250{
251 int i;
252 if (ehdr.e_shnum > MAX_SHDRS) {
253		die("Too many section headers: %d (only %d supported)\n",
254			ehdr.e_shnum, MAX_SHDRS);
255 }
256 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
257 die("Seek to %d failed: %s\n",
258 ehdr.e_shoff, strerror(errno));
259 }
260 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
261 die("Cannot read ELF section headers: %s\n",
262 strerror(errno));
263 }
264 for(i = 0; i < ehdr.e_shnum; i++) {
265 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name);
266 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type);
267 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags);
268 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr);
269 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset);
270 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size);
271 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link);
272 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info);
273 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
274 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize);
275 }
276
277}
278
279static void read_strtabs(FILE *fp)
280{
281 int i;
282 for(i = 0; i < ehdr.e_shnum; i++) {
283 if (shdr[i].sh_type != SHT_STRTAB) {
284 continue;
285 }
286 strtab[i] = malloc(shdr[i].sh_size);
287 if (!strtab[i]) {
288 die("malloc of %d bytes for strtab failed\n",
289 shdr[i].sh_size);
290 }
291 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
292 die("Seek to %d failed: %s\n",
293 shdr[i].sh_offset, strerror(errno));
294 }
295 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
296 die("Cannot read symbol table: %s\n",
297 strerror(errno));
298 }
299 }
300}
301
302static void read_symtabs(FILE *fp)
303{
304 int i,j;
305 for(i = 0; i < ehdr.e_shnum; i++) {
306 if (shdr[i].sh_type != SHT_SYMTAB) {
307 continue;
308 }
309 symtab[i] = malloc(shdr[i].sh_size);
310 if (!symtab[i]) {
311 die("malloc of %d bytes for symtab failed\n",
312 shdr[i].sh_size);
313 }
314 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
315 die("Seek to %d failed: %s\n",
316 shdr[i].sh_offset, strerror(errno));
317 }
318 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
319 die("Cannot read symbol table: %s\n",
320 strerror(errno));
321 }
322 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
323 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name);
324 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
325 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size);
326 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
327 }
328 }
329}
330
331
332static void read_relocs(FILE *fp)
333{
334 int i,j;
335 for(i = 0; i < ehdr.e_shnum; i++) {
336 if (shdr[i].sh_type != SHT_REL) {
337 continue;
338 }
339 reltab[i] = malloc(shdr[i].sh_size);
340 if (!reltab[i]) {
341 die("malloc of %d bytes for relocs failed\n",
342 shdr[i].sh_size);
343 }
344 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
345 die("Seek to %d failed: %s\n",
346 shdr[i].sh_offset, strerror(errno));
347 }
348 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
349 die("Cannot read symbol table: %s\n",
350 strerror(errno));
351 }
352 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
353 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
354 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info);
355 }
356 }
357}
358
359
360static void print_absolute_symbols(void)
361{
362 int i;
363 printf("Absolute symbols\n");
364 printf(" Num: Value Size Type Bind Visibility Name\n");
365 for(i = 0; i < ehdr.e_shnum; i++) {
366 char *sym_strtab;
367 Elf32_Sym *sh_symtab;
368 int j;
369 if (shdr[i].sh_type != SHT_SYMTAB) {
370 continue;
371 }
372 sh_symtab = symtab[i];
373 sym_strtab = strtab[shdr[i].sh_link];
374 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
375 Elf32_Sym *sym;
376 const char *name;
377 sym = &symtab[i][j];
378 name = sym_name(sym_strtab, sym);
379 if (sym->st_shndx != SHN_ABS) {
380 continue;
381 }
382 printf("%5d %08x %5d %10s %10s %12s %s\n",
383 j, sym->st_value, sym->st_size,
384 sym_type(ELF32_ST_TYPE(sym->st_info)),
385 sym_bind(ELF32_ST_BIND(sym->st_info)),
386 sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
387 name);
388 }
389 }
390 printf("\n");
391}
392
393static void print_absolute_relocs(void)
394{
395 int i, printed = 0;
396
397 for(i = 0; i < ehdr.e_shnum; i++) {
398 char *sym_strtab;
399 Elf32_Sym *sh_symtab;
400 unsigned sec_applies, sec_symtab;
401 int j;
402 if (shdr[i].sh_type != SHT_REL) {
403 continue;
404 }
405 sec_symtab = shdr[i].sh_link;
406 sec_applies = shdr[i].sh_info;
407 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
408 continue;
409 }
410 sh_symtab = symtab[sec_symtab];
411 sym_strtab = strtab[shdr[sec_symtab].sh_link];
412 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
413 Elf32_Rel *rel;
414 Elf32_Sym *sym;
415 const char *name;
416 rel = &reltab[i][j];
417 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
418 name = sym_name(sym_strtab, sym);
419 if (sym->st_shndx != SHN_ABS) {
420 continue;
421 }
422
423			/* Absolute symbols are not relocated if bzImage is
424			 * loaded at an address other than the one it was
425			 * compiled for. Display a warning to the user at
426			 * compile time about the absolute relocations present.
427			 *
428			 * The user needs to audit the code to make sure
429			 * symbols which should have been section relative
430			 * have not become absolute because of some linker
431			 * optimization or incorrect programming usage.
432			 *
433			 * Before warning, check whether this absolute symbol
434			 * relocation is harmless.
435			 */
436 if (is_safe_abs_reloc(name))
437 continue;
438
439 if (!printed) {
440 printf("WARNING: Absolute relocations"
441 " present\n");
442 printf("Offset Info Type Sym.Value "
443 "Sym.Name\n");
444 printed = 1;
445 }
446
447 printf("%08x %08x %10s %08x %s\n",
448 rel->r_offset,
449 rel->r_info,
450 rel_type(ELF32_R_TYPE(rel->r_info)),
451 sym->st_value,
452 name);
453 }
454 }
455
456 if (printed)
457 printf("\n");
458}
459
460static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
461{
462 int i;
463 /* Walk through the relocations */
464 for(i = 0; i < ehdr.e_shnum; i++) {
465 char *sym_strtab;
466 Elf32_Sym *sh_symtab;
467 unsigned sec_applies, sec_symtab;
468 int j;
469 if (shdr[i].sh_type != SHT_REL) {
470 continue;
471 }
472 sec_symtab = shdr[i].sh_link;
473 sec_applies = shdr[i].sh_info;
474 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
475 continue;
476 }
477 sh_symtab = symtab[sec_symtab];
478 sym_strtab = strtab[shdr[sec_symtab].sh_link];
479 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
480 Elf32_Rel *rel;
481 Elf32_Sym *sym;
482 unsigned r_type;
483 rel = &reltab[i][j];
484 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
485 r_type = ELF32_R_TYPE(rel->r_info);
486 /* Don't visit relocations to absolute symbols */
487 if (sym->st_shndx == SHN_ABS) {
488 continue;
489 }
490 if (r_type == R_386_PC32) {
491 /* PC relative relocations don't need to be adjusted */
492 }
493 else if (r_type == R_386_32) {
494 /* Visit relocations that need to be adjusted */
495 visit(rel, sym);
496 }
497 else {
498 die("Unsupported relocation type: %d\n", r_type);
499 }
500 }
501 }
502}
503
504static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
505{
506 reloc_count += 1;
507}
508
509static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
510{
511 /* Remember the address that needs to be adjusted. */
512 relocs[reloc_idx++] = rel->r_offset;
513}
514
515static int cmp_relocs(const void *va, const void *vb)
516{
517 const unsigned long *a, *b;
518 a = va; b = vb;
519 return (*a == *b)? 0 : (*a > *b)? 1 : -1;
520}
521
522static void emit_relocs(int as_text)
523{
524 int i;
525 /* Count how many relocations I have and allocate space for them. */
526 reloc_count = 0;
527 walk_relocs(count_reloc);
528 relocs = malloc(reloc_count * sizeof(relocs[0]));
529 if (!relocs) {
530 die("malloc of %d entries for relocs failed\n",
531 reloc_count);
532 }
533 /* Collect up the relocations */
534 reloc_idx = 0;
535 walk_relocs(collect_reloc);
536
537 /* Order the relocations for more efficient processing */
538 qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
539
540 /* Print the relocations */
541 if (as_text) {
542		/* Print the relocations in a form suitable for
543		 * consumption by gas.
544 */
545 printf(".section \".data.reloc\",\"a\"\n");
546 printf(".balign 4\n");
547 for(i = 0; i < reloc_count; i++) {
548 printf("\t .long 0x%08lx\n", relocs[i]);
549 }
550 printf("\n");
551 }
552 else {
553 unsigned char buf[4];
554 buf[0] = buf[1] = buf[2] = buf[3] = 0;
555 /* Print a stop */
556 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
557 /* Now print each relocation */
558 for(i = 0; i < reloc_count; i++) {
559 buf[0] = (relocs[i] >> 0) & 0xff;
560 buf[1] = (relocs[i] >> 8) & 0xff;
561 buf[2] = (relocs[i] >> 16) & 0xff;
562 buf[3] = (relocs[i] >> 24) & 0xff;
563 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
564 }
565 }
566}
567
568static void usage(void)
569{
570 die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
571}
572
573int main(int argc, char **argv)
574{
575 int show_absolute_syms, show_absolute_relocs;
576 int as_text;
577 const char *fname;
578 FILE *fp;
579 int i;
580
581 show_absolute_syms = 0;
582 show_absolute_relocs = 0;
583 as_text = 0;
584 fname = NULL;
585 for(i = 1; i < argc; i++) {
586 char *arg = argv[i];
587 if (*arg == '-') {
588 if (strcmp(argv[1], "--abs-syms") == 0) {
589 show_absolute_syms = 1;
590 continue;
591 }
592
593 if (strcmp(argv[1], "--abs-relocs") == 0) {
594 show_absolute_relocs = 1;
595 continue;
596 }
597 else if (strcmp(argv[1], "--text") == 0) {
598 as_text = 1;
599 continue;
600 }
601 }
602 else if (!fname) {
603 fname = arg;
604 continue;
605 }
606 usage();
607 }
608 if (!fname) {
609 usage();
610 }
611 fp = fopen(fname, "r");
612 if (!fp) {
613 die("Cannot open %s: %s\n",
614 fname, strerror(errno));
615 }
616 read_ehdr(fp);
617 read_shdrs(fp);
618 read_strtabs(fp);
619 read_symtabs(fp);
620 read_relocs(fp);
621 if (show_absolute_syms) {
622 print_absolute_symbols();
623 return 0;
624 }
625 if (show_absolute_relocs) {
626 print_absolute_relocs();
627 return 0;
628 }
629 emit_relocs(as_text);
630 return 0;
631}
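Without --text, emit_relocs() writes a plain binary stream to stdout: a four-zero-byte stop record followed by one little-endian 32-bit offset per relocation, sorted ascending. The reader below is a hypothetical host-side consumer of that stream, shown only to illustrate the format; it is not part of the build.

#include <stdint.h>
#include <stdio.h>

static uint32_t read_le32(FILE *fp, int *ok)
{
        unsigned char b[4];

        *ok = (fread(b, 1, 4, fp) == 4);
        if (!*ok)
                return 0;
        return b[0] | (b[1] << 8) | ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(int argc, char **argv)
{
        FILE *fp;
        int ok;

        if (argc != 2)
                return 1;
        fp = fopen(argv[1], "rb");
        if (!fp)
                return 1;

        (void)read_le32(fp, &ok);               /* leading four-zero-byte stop */
        while (ok) {
                uint32_t off = read_le32(fp, &ok);
                if (ok)
                        printf("reloc at 0x%08x\n", (unsigned)off);
        }
        fclose(fp);
        return 0;
}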
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
new file mode 100644
index 000000000000..cc4854f6c6c1
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_32.lds
@@ -0,0 +1,43 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0 ;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .data.compressed : {
16 *(.data.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr
new file mode 100644
index 000000000000..707a88f7f29e
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_32.scr
@@ -0,0 +1,10 @@
1SECTIONS
2{
3 .data.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
new file mode 100644
index 000000000000..94c13e557fb4
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_64.lds
@@ -0,0 +1,44 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
2OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 *(.text.compressed)
15 _text = .; /* Text */
16 *(.text)
17 *(.text.*)
18 _etext = . ;
19 }
20 .rodata : {
21 _rodata = . ;
22 *(.rodata) /* read-only data */
23 *(.rodata.*)
24 _erodata = . ;
25 }
26 .data : {
27 _data = . ;
28 *(.data)
29 *(.data.*)
30 _edata = . ;
31 }
32 .bss : {
33 _bss = . ;
34 *(.bss)
35 *(.bss.*)
36 *(COMMON)
37 . = ALIGN(8);
38 _end = . ;
39 . = ALIGN(4096);
40 pgtable = . ;
41 . = . + 4096 * 6;
42 _heap = .;
43 }
44}
diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux_64.scr
new file mode 100644
index 000000000000..bd1429ce193e
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux_64.scr
@@ -0,0 +1,10 @@
1SECTIONS
2{
3 .text.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
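Both .scr scripts wrap the piggybacked gzip image and define input_len, input_data, input_data_end and output_len around it; output_len = . - 4 lands on the gzip ISIZE trailer, the little-endian uncompressed length (mod 2^32) that head_*.S uses for its buffer sizing. A small host-side sketch that reads the same trailer field from any .gz file:

#include <stdint.h>
#include <stdio.h>

int main(int argc, char **argv)
{
        unsigned char b[4];
        uint32_t isize;
        FILE *fp;

        if (argc != 2)
                return 1;
        fp = fopen(argv[1], "rb");
        if (!fp)
                return 1;
        /* The last four bytes of a gzip stream are ISIZE, little-endian. */
        if (fseek(fp, -4, SEEK_END) != 0 || fread(b, 1, 4, fp) != 4) {
                fclose(fp);
                return 1;
        }
        isize = b[0] | (b[1] << 8) | ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
        printf("uncompressed size: %u bytes\n", (unsigned)isize);
        fclose(fp);
        return 0;
}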
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
new file mode 100644
index 000000000000..ef127e56a3cf
--- /dev/null
+++ b/arch/x86/boot/copy.S
@@ -0,0 +1,101 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/copy.S
13 *
14 * Memory copy routines
15 */
16
17 .code16gcc
18 .text
19
20 .globl memcpy
21 .type memcpy, @function
22memcpy:
23 pushw %si
24 pushw %di
25 movw %ax, %di
26 movw %dx, %si
27 pushw %cx
28 shrw $2, %cx
29 rep; movsl
30 popw %cx
31 andw $3, %cx
32 rep; movsb
33 popw %di
34 popw %si
35 ret
36 .size memcpy, .-memcpy
37
38 .globl memset
39 .type memset, @function
40memset:
41 pushw %di
42 movw %ax, %di
43 movzbl %dl, %eax
44 imull $0x01010101,%eax
45 pushw %cx
46 shrw $2, %cx
47 rep; stosl
48 popw %cx
49 andw $3, %cx
50 rep; stosb
51 popw %di
52 ret
53 .size memset, .-memset
54
55 .globl copy_from_fs
56 .type copy_from_fs, @function
57copy_from_fs:
58 pushw %ds
59 pushw %fs
60 popw %ds
61 call memcpy
62 popw %ds
63 ret
64 .size copy_from_fs, .-copy_from_fs
65
66 .globl copy_to_fs
67 .type copy_to_fs, @function
68copy_to_fs:
69 pushw %es
70 pushw %fs
71 popw %es
72 call memcpy
73 popw %es
74 ret
75 .size copy_to_fs, .-copy_to_fs
76
77#if 0 /* Not currently used, but can be enabled as needed */
78
79 .globl copy_from_gs
80 .type copy_from_gs, @function
81copy_from_gs:
82 pushw %ds
83 pushw %gs
84 popw %ds
85 call memcpy
86 popw %ds
87 ret
88 .size copy_from_gs, .-copy_from_gs
89 .globl copy_to_gs
90
91 .type copy_to_gs, @function
92copy_to_gs:
93 pushw %es
94 pushw %gs
95 popw %es
96 call memcpy
97 popw %es
98 ret
99 .size copy_to_gs, .-copy_to_gs
100
101#endif
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
new file mode 100644
index 000000000000..2a5c32da5852
--- /dev/null
+++ b/arch/x86/boot/cpu.c
@@ -0,0 +1,69 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cpu.c
13 *
14 * Check for obligatory CPU features and abort if the features are not
15 * present.
16 */
17
18#include "boot.h"
19#include "bitops.h"
20#include <asm/cpufeature.h>
21
22static char *cpu_name(int level)
23{
24 static char buf[6];
25
26 if (level == 64) {
27 return "x86-64";
28 } else {
29 sprintf(buf, "i%d86", level);
30 return buf;
31 }
32}
33
34int validate_cpu(void)
35{
36 u32 *err_flags;
37 int cpu_level, req_level;
38
39 check_cpu(&cpu_level, &req_level, &err_flags);
40
41 if (cpu_level < req_level) {
42 printf("This kernel requires an %s CPU, ",
43 cpu_name(req_level));
44 printf("but only detected an %s CPU.\n",
45 cpu_name(cpu_level));
46 return -1;
47 }
48
49 if (err_flags) {
50 int i, j;
51 puts("This kernel requires the following features "
52 "not present on the CPU:\n");
53
54 for (i = 0; i < NCAPINTS; i++) {
55 u32 e = err_flags[i];
56
57 for (j = 0; j < 32; j++) {
58 if (e & 1)
59 printf("%d:%d ", i, j);
60
61 e >>= 1;
62 }
63 }
64 putchar('\n');
65 return -1;
66 } else {
67 return 0;
68 }
69}
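The "%d:%d" pairs printed by validate_cpu() name missing features by cpufeature word and bit; word 0 corresponds to CPUID.1:EDX, so 0:25 and 0:26 would be SSE and SSE2. The toy decoder below uses a deliberately partial, illustrative name table; the mapping is an assumption for illustration, not lifted from cpufeature.h in this patch.

#include <stdio.h>

/* Illustrative (and intentionally incomplete) word:bit -> name mapping. */
static const char *feature_name(int word, int bit)
{
        if (word == 0 && bit == 0)  return "fpu";
        if (word == 0 && bit == 6)  return "pae";
        if (word == 0 && bit == 25) return "sse";
        if (word == 0 && bit == 26) return "sse2";
        return "?";
}

int main(void)
{
        /* e.g. the kernel printed "0:25 0:26 " */
        int missing[][2] = { {0, 25}, {0, 26} };

        for (unsigned i = 0; i < sizeof(missing) / sizeof(missing[0]); i++)
                printf("%d:%d -> %s\n", missing[i][0], missing[i][1],
                       feature_name(missing[i][0], missing[i][1]));
        return 0;
}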
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
new file mode 100644
index 000000000000..e655a89c5510
--- /dev/null
+++ b/arch/x86/boot/cpucheck.c
@@ -0,0 +1,268 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/cpucheck.c
13 *
14 * Check for obligatory CPU features and abort if the features are not
15 * present. This code should be compilable as 16-, 32- or 64-bit
16 * code, so be very careful with types and inline assembly.
17 *
18 * This code should not contain any messages; that requires an
19 * additional wrapper.
20 *
21 * As written, this code is not safe for inclusion into the kernel
22 * proper (after FPU initialization, in particular).
23 */
24
25#ifdef _SETUP
26# include "boot.h"
27# include "bitops.h"
28#endif
29#include <linux/types.h>
30#include <asm/cpufeature.h>
31#include <asm/processor-flags.h>
32#include <asm/required-features.h>
33#include <asm/msr-index.h>
34
35struct cpu_features {
36 int level; /* Family, or 64 for x86-64 */
37 int model;
38 u32 flags[NCAPINTS];
39};
40
41static struct cpu_features cpu;
42static u32 cpu_vendor[3];
43static u32 err_flags[NCAPINTS];
44
45#ifdef CONFIG_X86_64
46static const int req_level = 64;
47#elif defined(CONFIG_X86_MINIMUM_CPU_FAMILY)
48static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
49#else
50static const int req_level = 3;
51#endif
52
53static const u32 req_flags[NCAPINTS] =
54{
55 REQUIRED_MASK0,
56 REQUIRED_MASK1,
57 REQUIRED_MASK2,
58 REQUIRED_MASK3,
59 REQUIRED_MASK4,
60 REQUIRED_MASK5,
61 REQUIRED_MASK6,
62 REQUIRED_MASK7,
63};
64
65#define A32(a,b,c,d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
66
67static int is_amd(void)
68{
69 return cpu_vendor[0] == A32('A','u','t','h') &&
70 cpu_vendor[1] == A32('e','n','t','i') &&
71 cpu_vendor[2] == A32('c','A','M','D');
72}
73
74static int is_centaur(void)
75{
76 return cpu_vendor[0] == A32('C','e','n','t') &&
77 cpu_vendor[1] == A32('a','u','r','H') &&
78 cpu_vendor[2] == A32('a','u','l','s');
79}
80
81static int is_transmeta(void)
82{
83 return cpu_vendor[0] == A32('G','e','n','u') &&
84 cpu_vendor[1] == A32('i','n','e','T') &&
85 cpu_vendor[2] == A32('M','x','8','6');
86}
87
88static int has_fpu(void)
89{
90 u16 fcw = -1, fsw = -1;
91 u32 cr0;
92
93 asm("movl %%cr0,%0" : "=r" (cr0));
94 if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
95 cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
96 asm volatile("movl %0,%%cr0" : : "r" (cr0));
97 }
98
99 asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
100 : "+m" (fsw), "+m" (fcw));
101
102 return fsw == 0 && (fcw & 0x103f) == 0x003f;
103}
104
105static int has_eflag(u32 mask)
106{
107 u32 f0, f1;
108
109 asm("pushfl ; "
110 "pushfl ; "
111 "popl %0 ; "
112 "movl %0,%1 ; "
113 "xorl %2,%1 ; "
114 "pushl %1 ; "
115 "popfl ; "
116 "pushfl ; "
117 "popl %1 ; "
118 "popfl"
119 : "=&r" (f0), "=&r" (f1)
120 : "ri" (mask));
121
122 return !!((f0^f1) & mask);
123}
124
125static void get_flags(void)
126{
127 u32 max_intel_level, max_amd_level;
128 u32 tfms;
129
130 if (has_fpu())
131 set_bit(X86_FEATURE_FPU, cpu.flags);
132
133 if (has_eflag(X86_EFLAGS_ID)) {
134 asm("cpuid"
135 : "=a" (max_intel_level),
136 "=b" (cpu_vendor[0]),
137 "=d" (cpu_vendor[1]),
138 "=c" (cpu_vendor[2])
139 : "a" (0));
140
141 if (max_intel_level >= 0x00000001 &&
142 max_intel_level <= 0x0000ffff) {
143 asm("cpuid"
144 : "=a" (tfms),
145 "=c" (cpu.flags[4]),
146 "=d" (cpu.flags[0])
147 : "a" (0x00000001)
148 : "ebx");
149 cpu.level = (tfms >> 8) & 15;
150 cpu.model = (tfms >> 4) & 15;
151 if (cpu.level >= 6)
152 cpu.model += ((tfms >> 16) & 0xf) << 4;
153 }
154
155 asm("cpuid"
156 : "=a" (max_amd_level)
157 : "a" (0x80000000)
158 : "ebx", "ecx", "edx");
159
160 if (max_amd_level >= 0x80000001 &&
161 max_amd_level <= 0x8000ffff) {
162 u32 eax = 0x80000001;
163 asm("cpuid"
164 : "+a" (eax),
165 "=c" (cpu.flags[6]),
166 "=d" (cpu.flags[1])
167 : : "ebx");
168 }
169 }
170}
171
172/* Returns a bitmask of which words we have error bits in */
173static int check_flags(void)
174{
175 u32 err;
176 int i;
177
178 err = 0;
179 for (i = 0; i < NCAPINTS; i++) {
180 err_flags[i] = req_flags[i] & ~cpu.flags[i];
181 if (err_flags[i])
182 err |= 1 << i;
183 }
184
185 return err;
186}
187
188/*
189 * Returns -1 on error.
190 *
191 * *cpu_level is set to the current CPU level; *req_level to the required
192 * level. x86-64 is considered level 64 for this purpose.
193 *
194 * *err_flags_ptr is set to the flags error array if there are flags missing.
195 */
196int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
197{
198 int err;
199
200 memset(&cpu.flags, 0, sizeof cpu.flags);
201 cpu.level = 3;
202
203 if (has_eflag(X86_EFLAGS_AC))
204 cpu.level = 4;
205
206 get_flags();
207 err = check_flags();
208
209 if (test_bit(X86_FEATURE_LM, cpu.flags))
210 cpu.level = 64;
211
212 if (err == 0x01 &&
213 !(err_flags[0] &
214 ~((1 << X86_FEATURE_XMM)|(1 << X86_FEATURE_XMM2))) &&
215 is_amd()) {
216 /* If this is an AMD and we're only missing SSE+SSE2, try to
217 turn them on */
218
219 u32 ecx = MSR_K7_HWCR;
220 u32 eax, edx;
221
222 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
223 eax &= ~(1 << 15);
224 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
225
226 get_flags(); /* Make sure it really did something */
227 err = check_flags();
228 } else if (err == 0x01 &&
229 !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
230 is_centaur() && cpu.model >= 6) {
231 /* If this is a VIA C3, we might have to enable CX8
232 explicitly */
233
234 u32 ecx = MSR_VIA_FCR;
235 u32 eax, edx;
236
237 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
238 eax |= (1<<1)|(1<<7);
239 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
240
241 set_bit(X86_FEATURE_CX8, cpu.flags);
242 err = check_flags();
243 } else if (err == 0x01 && is_transmeta()) {
244 /* Transmeta might have masked feature bits in word 0 */
245
246 u32 ecx = 0x80860004;
247 u32 eax, edx;
248 u32 level = 1;
249
250 asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
251 asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
252 asm("cpuid"
253 : "+a" (level), "=d" (cpu.flags[0])
254 : : "ecx", "ebx");
255 asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
256
257 err = check_flags();
258 }
259
260 if (err_flags_ptr)
261 *err_flags_ptr = err ? err_flags : NULL;
262 if (cpu_level_ptr)
263 *cpu_level_ptr = cpu.level;
264 if (req_level_ptr)
265 *req_level_ptr = req_level;
266
267 return (cpu.level < req_level || err) ? -1 : 0;
268}
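get_flags() decodes the family and model from CPUID leaf 1's EAX (tfms): base family in bits 11:8, base model in bits 7:4, and for family >= 6 the extended model from bits 19:16 shifted in on top. A worked example using a hypothetical signature value instead of a real CPUID instruction:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t tfms = 0x000106a5;           /* hypothetical CPUID.1:EAX  */
        unsigned level = (tfms >> 8) & 15;    /* base family -> 6          */
        unsigned model = (tfms >> 4) & 15;    /* base model  -> 0xa        */

        if (level >= 6)
                model += ((tfms >> 16) & 0xf) << 4;   /* extended model -> 0x1a */
        printf("family %u, model 0x%02x\n", level, model);
        return 0;
}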
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
new file mode 100644
index 000000000000..bd138e442ec2
--- /dev/null
+++ b/arch/x86/boot/edd.c
@@ -0,0 +1,167 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/edd.c
13 *
14 * Get EDD BIOS disk information
15 */
16
17#include "boot.h"
18#include <linux/edd.h>
19
20#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
21
22/*
23 * Read the MBR (first sector) from a specific device.
24 */
25static int read_mbr(u8 devno, void *buf)
26{
27 u16 ax, bx, cx, dx;
28
29 ax = 0x0201; /* Legacy Read, one sector */
30 cx = 0x0001; /* Sector 0-0-1 */
31 dx = devno;
32 bx = (size_t)buf;
33 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
34 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
35 : : "esi", "edi", "memory");
36
37 return -(u8)ax; /* 0 or -1 */
38}
39
40static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41{
42 int sector_size;
43 char *mbrbuf_ptr, *mbrbuf_end;
44 u32 buf_base, mbr_base;
45 extern char _end[];
46
47 sector_size = ei->params.bytes_per_sector;
48 if (!sector_size)
49 sector_size = 512; /* Best available guess */
50
51 /* Produce a naturally aligned buffer on the heap */
52 buf_base = (ds() << 4) + (u32)&_end;
53 mbr_base = (buf_base+sector_size-1) & ~(sector_size-1);
54 mbrbuf_ptr = _end + (mbr_base-buf_base);
55 mbrbuf_end = mbrbuf_ptr + sector_size;
56
57 /* Make sure we actually have space on the heap... */
58 if (!(boot_params.hdr.loadflags & CAN_USE_HEAP))
59 return -1;
60 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
61 return -1;
62
63 if (read_mbr(devno, mbrbuf_ptr))
64 return -1;
65
66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
67 return 0;
68}
69
70static int get_edd_info(u8 devno, struct edd_info *ei)
71{
72 u16 ax, bx, cx, dx, di;
73
74 memset(ei, 0, sizeof *ei);
75
76 /* Check Extensions Present */
77
78 ax = 0x4100;
79 bx = EDDMAGIC1;
80 dx = devno;
81 asm("pushfl; stc; int $0x13; setc %%al; popfl"
82 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
83 : : "esi", "edi");
84
85 if ((u8)ax)
86 return -1; /* No extended information */
87
88 if (bx != EDDMAGIC2)
89 return -1;
90
91 ei->device = devno;
92 ei->version = ax >> 8; /* EDD version number */
93 ei->interface_support = cx; /* EDD functionality subsets */
94
95 /* Extended Get Device Parameters */
96
97 ei->params.length = sizeof(ei->params);
98 ax = 0x4800;
99 dx = devno;
100 asm("pushfl; int $0x13; popfl"
101 : "+a" (ax), "+d" (dx), "=m" (ei->params)
102 : "S" (&ei->params)
103 : "ebx", "ecx", "edi");
104
105 /* Get legacy CHS parameters */
106
107 /* Ralf Brown recommends setting ES:DI to 0:0 */
108 ax = 0x0800;
109 dx = devno;
110 di = 0;
111 asm("pushw %%es; "
112 "movw %%di,%%es; "
113 "pushfl; stc; int $0x13; setc %%al; popfl; "
114 "popw %%es"
115 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
116 : : "esi");
117
118 if ((u8)ax == 0) {
119 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
120 ei->legacy_max_head = dx >> 8;
121 ei->legacy_sectors_per_track = cx & 0x3f;
122 }
123
124 return 0;
125}
126
127void query_edd(void)
128{
129 char eddarg[8];
130 int do_mbr = 1;
131 int do_edd = 1;
132 int devno;
133 struct edd_info ei, *edp;
134 u32 *mbrptr;
135
136 if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) {
137 if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip"))
138 do_mbr = 0;
139 else if (!strcmp(eddarg, "off"))
140 do_edd = 0;
141 }
142
143 edp = boot_params.eddbuf;
144 mbrptr = boot_params.edd_mbr_sig_buffer;
145
146 if (!do_edd)
147 return;
148
149 for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
150 /*
151 * Scan the BIOS-supported hard disks and query EDD
152 * information...
153 */
154 get_edd_info(devno, &ei);
155
156 if (boot_params.eddbuf_entries < EDDMAXNR) {
157 memcpy(edp, &ei, sizeof ei);
158 edp++;
159 boot_params.eddbuf_entries++;
160 }
161
162 if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
163 boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
164 }
165}
166
167#endif
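The buffer setup in read_mbr_sig() depends on sector_size being a power of two: buf_base is rounded up to the next sector boundary with the usual (x + size - 1) & ~(size - 1) expression. A minimal stand-alone sketch of that rounding, with hypothetical values rather than the real heap layout:

    #include <stdio.h>

    int main(void)
    {
            unsigned int sector_size = 512;    /* power of two, as the code assumes */
            unsigned int buf_base = 0x12345;   /* hypothetical (ds() << 4) + _end */
            unsigned int mbr_base;

            /* Round up to the next sector boundary, as in read_mbr_sig() */
            mbr_base = (buf_base + sector_size - 1) & ~(sector_size - 1);

            printf("buf_base=%#x -> mbr_base=%#x\n", buf_base, mbr_base); /* 0x12400 */
            return 0;
    }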
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
new file mode 100644
index 000000000000..f3140e596d40
--- /dev/null
+++ b/arch/x86/boot/header.S
@@ -0,0 +1,283 @@
1/*
2 * header.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Based on bootsect.S and setup.S
7 * modified by more people than can be counted
8 *
9 * Rewritten as a common file by H. Peter Anvin (Apr 2007)
10 *
11 * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
12 * addresses must be multiplied by 16 to obtain their respective linear
13 * addresses. To avoid confusion, linear addresses are written using leading
14 * hex while segment addresses are written as segment:offset.
15 *
16 */
17
18#include <asm/segment.h>
19#include <linux/utsrelease.h>
20#include <asm/boot.h>
21#include <asm/e820.h>
22#include <asm/page.h>
23#include <asm/setup.h>
24#include "boot.h"
25
26SETUPSECTS = 4 /* default nr of setup-sectors */
27BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
29SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
30 /* to be loaded */
31ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
32SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
33
34#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA
36#endif
37
38#ifndef RAMDISK
39#define RAMDISK 0
40#endif
41
42#ifndef ROOT_RDONLY
43#define ROOT_RDONLY 1
44#endif
45
46 .code16
47 .section ".bstext", "ax"
48
49 .global bootsect_start
50bootsect_start:
51
52 # Normalize the start address
53 ljmp $BOOTSEG, $start2
54
55start2:
56 movw %cs, %ax
57 movw %ax, %ds
58 movw %ax, %es
59 movw %ax, %ss
60 xorw %sp, %sp
61 sti
62 cld
63
64 movw $bugger_off_msg, %si
65
66msg_loop:
67 lodsb
68 andb %al, %al
69 jz bs_die
70 movb $0xe, %ah
71 movw $7, %bx
72 int $0x10
73 jmp msg_loop
74
75bs_die:
76 # Allow the user to press a key, then reboot
77 xorw %ax, %ax
78 int $0x16
79 int $0x19
80
81 # int 0x19 should never return. In case it does anyway,
82 # invoke the BIOS reset code...
83 ljmp $0xf000,$0xfff0
84
85 .section ".bsdata", "a"
86bugger_off_msg:
87 .ascii "Direct booting from floppy is no longer supported.\r\n"
88 .ascii "Please use a boot loader program instead.\r\n"
89 .ascii "\n"
90 .ascii "Remove disk and press any key to reboot . . .\r\n"
91 .byte 0
92
93
94 # Kernel attributes; used by setup. This is part 1 of the
95 # header, from the old boot sector.
96
97 .section ".header", "a"
98 .globl hdr
99hdr:
100setup_sects: .byte SETUPSECTS
101root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE
103ram_size: .word RAMDISK
104vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV
106boot_flag: .word 0xAA55
107
108 # offset 512, entry point
109
110 .globl _start
111_start:
112 # Explicitly enter this as bytes, or the assembler
113 # tries to generate a 3-byte jump here, which causes
114 # everything else to push off to the wrong offset.
115 .byte 0xeb # short (2-byte) jump
116 .byte start_of_setup-1f
1171:
118
119 # Part 2 of the header, from the old setup.S
120
121 .ascii "HdrS" # header signature
122		.word	0x0206		# header version number (>= 0x0105,
123					# or else old loadlin-1.5 will fail)
124 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG
127 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't
130 # change it.
131
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
133 # Bootlin, SYSLX, bootsect...)
134 # See Documentation/i386/boot.txt for
135 # assigned ids
136
137# flags, unused bits must be zero (RFU) bit within loadflags
138loadflags:
139LOADED_HIGH = 1 # If set, the kernel is loaded high
140CAN_USE_HEAP = 0x80 # If set, the loader also has set
141 # heap_end_ptr to tell how much
142 # space behind setup.S can be used for
143 # heap purposes.
144 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH
149#endif
150
151setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup
153 # to 0x90000 then just before jumping
154 # into the kernel. However, only the
155 # loader knows how much data behind
156 # us also needs to be loaded.
157
158code32_start: # here loaders can put a different
159 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165
166ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit
168 # address where it loaded the image.
169 # This only will be read by the kernel.
170
171ramdisk_size: .long 0 # its size in bytes
172
173bootsect_kludge:
174 .long 0 # obsolete
175
176heap_end_ptr: .word _end+1024 # (Header version 0x0201 or later)
177 # space from here (exclusive) down to
178 # end of setup code can be used by setup
179 # for local heap purposes.
180
181pad1: .word 0
182cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
183 # If nonzero, a 32-bit pointer
184 # to the kernel command line.
185 # The command line should be
186 # located between the start of
187 # setup and the end of low
188 # memory (0xa0000), or it may
189 # get overwritten before it
190 # gets read. If this field is
191 # used, there is no longer
192 # anything magical about the
193 # 0x90000 segment; the setup
194 # can be located anywhere in
195 # low memory 0x10000 or higher.
196
197ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
198 # (Header version 0x0203 or later)
199 # The highest safe address for
200 # the contents of an initrd
201
202kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
203 #required for protected mode
204 #kernel
205#ifdef CONFIG_RELOCATABLE
206relocatable_kernel: .byte 1
207#else
208relocatable_kernel: .byte 0
209#endif
210pad2: .byte 0
211pad3: .word 0
212
213cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
214 #added with boot protocol
215 #version 2.06
216
217# End of setup header #####################################################
218
219 .section ".inittext", "ax"
220start_of_setup:
221#ifdef SAFE_RESET_DISK_CONTROLLER
222# Reset the disk controller.
223 movw $0x0000, %ax # Reset disk controller
224 movb $0x80, %dl # All disks
225 int $0x13
226#endif
227
228# We will have entered with %cs = %ds+0x20, normalize %cs so
229# it is on par with the other segments.
230 pushw %ds
231 pushw $setup2
232 lretw
233
234setup2:
235# Force %es = %ds
236 movw %ds, %ax
237 movw %ax, %es
238 cld
239
240# Stack paranoia: align the stack and make sure it is good
241# for both 16- and 32-bit references. In particular, if we
242# were meant to have been using the full 16-bit segment, the
243# caller might have set %sp to zero, which breaks %esp-based
244# references.
245 andw $~3, %sp # dword align (might as well...)
246 jnz 1f
247 movw $0xfffc, %sp # Make sure we're not zero
2481: movzwl %sp, %esp # Clear upper half of %esp
249 sti
250
251# Check signature at end of setup
252 cmpl $0x5a5aaa55, setup_sig
253 jne setup_bad
254
255# Zero the bss
256 movw $__bss_start, %di
257 movw $_end+3, %cx
258 xorl %eax, %eax
259 subw %di, %cx
260 shrw $2, %cx
261 rep; stosl
262
263# Jump to C code (should not return)
264 calll main
265
266# Setup corrupt somehow...
267setup_bad:
268 movl $setup_corrupt, %eax
269 calll puts
270 # Fall through...
271
272 .globl die
273 .type die, @function
274die:
275 hlt
276 jmp die
277
278 .size die, .-die
279
280 .section ".initdata", "a"
281setup_corrupt:
282 .byte 7
283 .string "No setup signature found...\n"
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
new file mode 100644
index 000000000000..88d77761d01b
--- /dev/null
+++ b/arch/x86/boot/install.sh
@@ -0,0 +1,61 @@
1#!/bin/sh
2#
3# arch/i386/boot/install.sh
4#
5# This file is subject to the terms and conditions of the GNU General Public
6# License. See the file "COPYING" in the main directory of this archive
7# for more details.
8#
9# Copyright (C) 1995 by Linus Torvalds
10#
11# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
12#
13# "make install" script for i386 architecture
14#
15# Arguments:
16# $1 - kernel version
17# $2 - kernel image file
18# $3 - kernel map file
19# $4 - default install path (blank if root directory)
20#
21
22verify () {
23 if [ ! -f "$1" ]; then
24 echo "" 1>&2
25 echo " *** Missing file: $1" 1>&2
26 echo ' *** You need to run "make" before "make install".' 1>&2
27 echo "" 1>&2
28 exit 1
29 fi
30}
31
32# Make sure the files actually exist
33verify "$2"
34verify "$3"
35
36# User may have a custom install script
37
38if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
39if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
40
41# Default install - same as make zlilo
42
43if [ -f $4/vmlinuz ]; then
44 mv $4/vmlinuz $4/vmlinuz.old
45fi
46
47if [ -f $4/System.map ]; then
48 mv $4/System.map $4/System.old
49fi
50
51cat $2 > $4/vmlinuz
52cp $3 $4/System.map
53
54if [ -x /sbin/lilo ]; then
55 /sbin/lilo
56elif [ -x /etc/lilo/install ]; then
57 /etc/lilo/install
58else
59 sync
60 echo "Cannot find LILO."
61fi
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
new file mode 100644
index 000000000000..0eeef3989a17
--- /dev/null
+++ b/arch/x86/boot/main.c
@@ -0,0 +1,161 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/main.c
13 *
14 * Main module for the real-mode kernel code
15 */
16
17#include "boot.h"
18
19struct boot_params boot_params __attribute__((aligned(16)));
20
21char *HEAP = _end;
22char *heap_end = _end; /* Default end of heap = no heap */
23
24/*
25 * Copy the header into the boot parameter block. Since this
26 * screws up the old-style command line protocol, adjust by
27 * filling in the new-style command line pointer instead.
28 */
29#define OLD_CL_MAGIC 0xA33F
30#define OLD_CL_ADDRESS 0x20
31
32static void copy_boot_params(void)
33{
34 struct old_cmdline {
35 u16 cl_magic;
36 u16 cl_offset;
37 };
38 const struct old_cmdline * const oldcmd =
39 (const struct old_cmdline *)OLD_CL_ADDRESS;
40
41 BUILD_BUG_ON(sizeof boot_params != 4096);
42 memcpy(&boot_params.hdr, &hdr, sizeof hdr);
43
44 if (!boot_params.hdr.cmd_line_ptr &&
45 oldcmd->cl_magic == OLD_CL_MAGIC) {
46 /* Old-style command line protocol. */
47 u16 cmdline_seg;
48
49 /* Figure out if the command line falls in the region
50 of memory that an old kernel would have copied up
51 to 0x90000... */
52 if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
53 cmdline_seg = ds();
54 else
55 cmdline_seg = 0x9000;
56
57 boot_params.hdr.cmd_line_ptr =
58 (cmdline_seg << 4) + oldcmd->cl_offset;
59 }
60}
61
62/*
63 * Set the keyboard repeat rate to maximum. Unclear why this
64 * is done here; this might be possible to kill off as stale code.
65 */
66static void keyboard_set_repeat(void)
67{
68 u16 ax = 0x0305;
69 u16 bx = 0;
70 asm volatile("int $0x16"
71 : "+a" (ax), "+b" (bx)
72 : : "ecx", "edx", "esi", "edi");
73}
74
75/*
76 * Get Intel SpeedStep (IST) information.
77 */
78static void query_ist(void)
79{
80 asm("int $0x15"
81 : "=a" (boot_params.ist_info.signature),
82 "=b" (boot_params.ist_info.command),
83 "=c" (boot_params.ist_info.event),
84 "=d" (boot_params.ist_info.perf_level)
85 : "a" (0x0000e980), /* IST Support */
86 "d" (0x47534943)); /* Request value */
87}
88
89/*
90 * Tell the BIOS what CPU mode we intend to run in.
91 */
92static void set_bios_mode(void)
93{
94#ifdef CONFIG_X86_64
95 u32 eax, ebx;
96
97 eax = 0xec00;
98 ebx = 2;
99 asm volatile("int $0x15"
100 : "+a" (eax), "+b" (ebx)
101 : : "ecx", "edx", "esi", "edi");
102#endif
103}
104
105void main(void)
106{
107 /* First, copy the boot header into the "zeropage" */
108 copy_boot_params();
109
110 /* End of heap check */
111 if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
112 heap_end = (char *)(boot_params.hdr.heap_end_ptr
113 +0x200-STACK_SIZE);
114 } else {
115 /* Boot protocol 2.00 only, no heap available */
116 puts("WARNING: Ancient bootloader, some functionality "
117 "may be limited!\n");
118 }
119
120 /* Make sure we have all the proper CPU support */
121 if (validate_cpu()) {
122 puts("Unable to boot - please use a kernel appropriate "
123 "for your CPU.\n");
124 die();
125 }
126
127 /* Tell the BIOS what CPU mode we intend to run in. */
128 set_bios_mode();
129
130 /* Detect memory layout */
131 detect_memory();
132
133 /* Set keyboard repeat rate (why?) */
134 keyboard_set_repeat();
135
136 /* Set the video mode */
137 set_video();
138
139 /* Query MCA information */
140 query_mca();
141
142 /* Voyager */
143#ifdef CONFIG_X86_VOYAGER
144 query_voyager();
145#endif
146
147 /* Query Intel SpeedStep (IST) information */
148 query_ist();
149
150 /* Query APM information */
151#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
152 query_apm_bios();
153#endif
154
155 /* Query EDD information */
156#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
157 query_edd();
158#endif
159 /* Do the last things and invoke protected mode */
160 go_to_protected_mode();
161}
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
new file mode 100644
index 000000000000..68222f2d4b67
--- /dev/null
+++ b/arch/x86/boot/mca.c
@@ -0,0 +1,43 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/mca.c
13 *
14 * Get the MCA system description table
15 */
16
17#include "boot.h"
18
19int query_mca(void)
20{
21 u8 err;
22 u16 es, bx, len;
23
24 asm("pushw %%es ; "
25 "int $0x15 ; "
26 "setc %0 ; "
27 "movw %%es, %1 ; "
28 "popw %%es"
29 : "=acd" (err), "=acdSD" (es), "=b" (bx)
30 : "a" (0xc000));
31
32 if (err)
33 return -1; /* No MCA present */
34
35 set_fs(es);
36 len = rdfs16(bx);
37
38 if (len > sizeof(boot_params.sys_desc_table))
39 len = sizeof(boot_params.sys_desc_table);
40
41 copy_from_fs(&boot_params.sys_desc_table, bx, len);
42 return 0;
43}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
new file mode 100644
index 000000000000..378353956b5d
--- /dev/null
+++ b/arch/x86/boot/memory.c
@@ -0,0 +1,118 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/memory.c
13 *
14 * Memory detection code
15 */
16
17#include "boot.h"
18
19#define SMAP 0x534d4150 /* ASCII "SMAP" */
20
21static int detect_memory_e820(void)
22{
23 int count = 0;
24 u32 next = 0;
25 u32 size, id;
26 u8 err;
27 struct e820entry *desc = boot_params.e820_map;
28
29 do {
30 size = sizeof(struct e820entry);
31
32 /* Important: %edx is clobbered by some BIOSes,
33 so it must be either used for the error output
34 or explicitly marked clobbered. */
35 asm("int $0x15; setc %0"
36 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
37 "=m" (*desc)
38 : "D" (desc), "d" (SMAP), "a" (0xe820));
39
40 /* Some BIOSes stop returning SMAP in the middle of
41 the search loop. We don't know exactly how the BIOS
42 screwed up the map at that point, we might have a
43 partial map, the full map, or complete garbage, so
44 just return failure. */
45 if (id != SMAP) {
46 count = 0;
47 break;
48 }
49
50 if (err)
51 break;
52
53 count++;
54 desc++;
55 } while (next && count < E820MAX);
56
57 return boot_params.e820_entries = count;
58}
59
60static int detect_memory_e801(void)
61{
62 u16 ax, bx, cx, dx;
63 u8 err;
64
65 bx = cx = dx = 0;
66 ax = 0xe801;
67 asm("stc; int $0x15; setc %0"
68 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
69
70 if (err)
71 return -1;
72
73 /* Do we really need to do this? */
74 if (cx || dx) {
75 ax = cx;
76 bx = dx;
77 }
78
79 if (ax > 15*1024)
80 return -1; /* Bogus! */
81
82 /* This ignores memory above 16MB if we have a memory hole
83 there. If someone actually finds a machine with a memory
84 hole at 16MB and no support for 0E820h they should probably
85 generate a fake e820 map. */
86 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
87
88 return 0;
89}
90
91static int detect_memory_88(void)
92{
93 u16 ax;
94 u8 err;
95
96 ax = 0x8800;
97 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
98
99 boot_params.screen_info.ext_mem_k = ax;
100
101 return -err;
102}
103
104int detect_memory(void)
105{
106 int err = -1;
107
108 if (detect_memory_e820() > 0)
109 err = 0;
110
111 if (!detect_memory_e801())
112 err = 0;
113
114 if (!detect_memory_88())
115 err = 0;
116
117 return err;
118}
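In detect_memory_e801(), the BIOS reports memory in two pieces: KB between 1 MB and 16 MB (capped at 15*1024) and 64 KB blocks above 16 MB, so the second figure is shifted left by 6 to convert blocks to KB. A rough worked example of the alt_mem_k computation, assuming a 64 MB machine with no memory hole (hypothetical values):

    #include <stdio.h>

    int main(void)
    {
            unsigned int ax = 15 * 1024; /* KB between 1 MB and 16 MB (capped) */
            unsigned int bx = 768;       /* 64 KB blocks above 16 MB: 768 * 64 KB = 48 MB */
            unsigned int alt_mem_k;

            /* Same conversion as detect_memory_e801() */
            alt_mem_k = (ax == 15 * 1024) ? (bx << 6) + ax : ax;

            printf("alt_mem_k = %u KB\n", alt_mem_k); /* 64512 KB, i.e. 63 MB above 1 MB */
            return 0;
    }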
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
new file mode 100644
index 000000000000..efd6d2490c1d
--- /dev/null
+++ b/arch/x86/boot/mtools.conf.in
@@ -0,0 +1,17 @@
1#
2# mtools configuration file for "make (b)zdisk"
3#
4
5# Actual floppy drive
6drive a:
7 file="/dev/fd0"
8
9# 1.44 MB floppy disk image
10drive v:
11 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
12
13# 2.88 MB floppy disk image (mostly for virtual uses)
14drive w:
15 file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
16
17
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
new file mode 100644
index 000000000000..09fb342cc62e
--- /dev/null
+++ b/arch/x86/boot/pm.c
@@ -0,0 +1,174 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/pm.c
13 *
14 * Prepare the machine for transition to protected mode.
15 */
16
17#include "boot.h"
18#include <asm/segment.h>
19
20/*
21 * Invoke the realmode switch hook if present; otherwise
22 * disable all interrupts.
23 */
24static void realmode_switch_hook(void)
25{
26 if (boot_params.hdr.realmode_swtch) {
27 asm volatile("lcallw *%0"
28 : : "m" (boot_params.hdr.realmode_swtch)
29 : "eax", "ebx", "ecx", "edx");
30 } else {
31 asm volatile("cli");
32 outb(0x80, 0x70); /* Disable NMI */
33 io_delay();
34 }
35}
36
37/*
38 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
39 * A bzImage kernel is loaded and runs at 0x100000.
40 */
41static void move_kernel_around(void)
42{
43 /* Note: rely on the compile-time option here rather than
44 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
45 sets the loadflags to zero. */
46#ifndef __BIG_KERNEL__
47 u16 dst_seg, src_seg;
48 u32 syssize;
49
50 dst_seg = 0x1000 >> 4;
51 src_seg = 0x10000 >> 4;
52 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
53
54 while (syssize) {
55 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
56 int dwords = paras << 2;
57
58 asm volatile("pushw %%es ; "
59 "pushw %%ds ; "
60 "movw %1,%%es ; "
61 "movw %2,%%ds ; "
62 "xorw %%di,%%di ; "
63 "xorw %%si,%%si ; "
64 "rep;movsl ; "
65 "popw %%ds ; "
66 "popw %%es"
67 : "+c" (dwords)
68 : "r" (dst_seg), "r" (src_seg)
69 : "esi", "edi");
70
71 syssize -= paras;
72 dst_seg += paras;
73 src_seg += paras;
74 }
75#endif
76}
77
78/*
79 * Disable all interrupts at the legacy PIC.
80 */
81static void mask_all_interrupts(void)
82{
83 outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
84 io_delay();
85 outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
86 io_delay();
87}
88
89/*
90 * Reset IGNNE# if asserted in the FPU.
91 */
92static void reset_coprocessor(void)
93{
94 outb(0, 0xf0);
95 io_delay();
96 outb(0, 0xf1);
97 io_delay();
98}
99
100/*
101 * Set up the GDT
102 */
103#define GDT_ENTRY(flags,base,limit) \
104 (((u64)(base & 0xff000000) << 32) | \
105 ((u64)flags << 40) | \
106 ((u64)(limit & 0x00ff0000) << 32) | \
107 ((u64)(base & 0x00ffff00) << 16) | \
108 ((u64)(limit & 0x0000ffff)))
109
110struct gdt_ptr {
111 u16 len;
112 u32 ptr;
113} __attribute__((packed));
114
115static void setup_gdt(void)
116{
117 /* There are machines which are known to not boot with the GDT
118 being 8-byte unaligned. Intel recommends 16 byte alignment. */
119 static const u64 boot_gdt[] __attribute__((aligned(16))) = {
120 /* CS: code, read/execute, 4 GB, base 0 */
121 [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
122 /* DS: data, read/write, 4 GB, base 0 */
123 [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
124 };
125 /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
126 of the gdt_ptr contents. Thus, make it static so it will
127 stay in memory, at least long enough that we switch to the
128 proper kernel GDT. */
129 static struct gdt_ptr gdt;
130
131 gdt.len = sizeof(boot_gdt)-1;
132 gdt.ptr = (u32)&boot_gdt + (ds() << 4);
133
134 asm volatile("lgdtl %0" : : "m" (gdt));
135}
136
137/*
138 * Set up the IDT
139 */
140static void setup_idt(void)
141{
142 static const struct gdt_ptr null_idt = {0, 0};
143 asm volatile("lidtl %0" : : "m" (null_idt));
144}
145
146/*
147 * Actual invocation sequence
148 */
149void go_to_protected_mode(void)
150{
151 /* Hook before leaving real mode, also disables interrupts */
152 realmode_switch_hook();
153
154 /* Move the kernel/setup to their final resting places */
155 move_kernel_around();
156
157 /* Enable the A20 gate */
158 if (enable_a20()) {
159 puts("A20 gate not responding, unable to boot...\n");
160 die();
161 }
162
163 /* Reset coprocessor (IGNNE#) */
164 reset_coprocessor();
165
166 /* Mask all interrupts in the PIC */
167 mask_all_interrupts();
168
169 /* Actual transition to protected mode... */
170 setup_idt();
171 setup_gdt();
172 protected_mode_jump(boot_params.hdr.code32_start,
173 (u32)&boot_params + (ds() << 4));
174}
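The GDT_ENTRY() macro in pm.c packs the base and limit into the interleaved bit layout of an x86 segment descriptor. A small host-side sketch using the same packing (hypothetical main(), not part of the boot code) shows the boot code segment coming out as the familiar flat 4 GB descriptor 0x00cf9b000000ffff:

    #include <stdio.h>

    typedef unsigned long long u64;

    #define GDT_ENTRY(flags, base, limit)          \
            (((u64)((base)  & 0xff000000) << 32) | \
             ((u64)(flags) << 40)                | \
             ((u64)((limit) & 0x00ff0000) << 32) | \
             ((u64)((base)  & 0x00ffff00) << 16) | \
             ((u64)((limit) & 0x0000ffff)))

    int main(void)
    {
            /* CS: code, read/execute, 4 GB, base 0 */
            printf("boot CS = %#018llx\n", GDT_ENTRY(0xc09b, 0, 0xfffff));
            /* DS: data, read/write, 4 GB, base 0 */
            printf("boot DS = %#018llx\n", GDT_ENTRY(0xc093, 0, 0xfffff));
            return 0;
    }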
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
new file mode 100644
index 000000000000..2e559233725a
--- /dev/null
+++ b/arch/x86/boot/pmjump.S
@@ -0,0 +1,54 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/pmjump.S
13 *
14 * The actual transition into protected mode
15 */
16
17#include <asm/boot.h>
18#include <asm/segment.h>
19
20 .text
21
22 .globl protected_mode_jump
23 .type protected_mode_jump, @function
24
25 .code16
26
27/*
28 * void protected_mode_jump(u32 entrypoint, u32 bootparams);
29 */
30protected_mode_jump:
31 xorl %ebx, %ebx # Flag to indicate this is a boot
32 movl %edx, %esi # Pointer to boot_params table
33 movl %eax, 2f # Patch ljmpl instruction
34 jmp 1f # Short jump to flush instruction q.
35
361:
37 movw $__BOOT_DS, %cx
38
39 movl %cr0, %edx
40 orb $1, %dl # Protected mode (PE) bit
41 movl %edx, %cr0
42
43 movw %cx, %ds
44 movw %cx, %es
45 movw %cx, %fs
46 movw %cx, %gs
47 movw %cx, %ss
48
49 # Jump to the 32-bit entrypoint
50 .byte 0x66, 0xea # ljmpl opcode
512: .long 0 # offset
52 .word __BOOT_CS # segment
53
54 .size protected_mode_jump, .-protected_mode_jump
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
new file mode 100644
index 000000000000..1a09f9309d3c
--- /dev/null
+++ b/arch/x86/boot/printf.c
@@ -0,0 +1,307 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/printf.c
13 *
14 * Oh, it's a waste of space, but oh-so-yummy for debugging. This
15 * version of printf() does not include 64-bit support. "Live with
16 * it."
17 *
18 */
19
20#include "boot.h"
21
22static int skip_atoi(const char **s)
23{
24 int i = 0;
25
26 while (isdigit(**s))
27 i = i * 10 + *((*s)++) - '0';
28 return i;
29}
30
31#define ZEROPAD 1 /* pad with zero */
32#define SIGN 2 /* unsigned/signed long */
33#define PLUS 4 /* show plus */
34#define SPACE 8 /* space if plus */
35#define LEFT 16 /* left justified */
36#define SPECIAL 32 /* 0x */
37#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */
38
39#define do_div(n,base) ({ \
40int __res; \
41__res = ((unsigned long) n) % (unsigned) base; \
42n = ((unsigned long) n) / (unsigned) base; \
43__res; })
44
45static char *number(char *str, long num, int base, int size, int precision,
46 int type)
47{
48 char c, sign, tmp[66];
49 const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
50 int i;
51
52 if (type & LARGE)
53 digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
54 if (type & LEFT)
55 type &= ~ZEROPAD;
56 if (base < 2 || base > 36)
57 return 0;
58 c = (type & ZEROPAD) ? '0' : ' ';
59 sign = 0;
60 if (type & SIGN) {
61 if (num < 0) {
62 sign = '-';
63 num = -num;
64 size--;
65 } else if (type & PLUS) {
66 sign = '+';
67 size--;
68 } else if (type & SPACE) {
69 sign = ' ';
70 size--;
71 }
72 }
73 if (type & SPECIAL) {
74 if (base == 16)
75 size -= 2;
76 else if (base == 8)
77 size--;
78 }
79 i = 0;
80 if (num == 0)
81 tmp[i++] = '0';
82 else
83 while (num != 0)
84 tmp[i++] = digits[do_div(num, base)];
85 if (i > precision)
86 precision = i;
87 size -= precision;
88 if (!(type & (ZEROPAD + LEFT)))
89 while (size-- > 0)
90 *str++ = ' ';
91 if (sign)
92 *str++ = sign;
93 if (type & SPECIAL) {
94 if (base == 8)
95 *str++ = '0';
96 else if (base == 16) {
97 *str++ = '0';
98 *str++ = digits[33];
99 }
100 }
101 if (!(type & LEFT))
102 while (size-- > 0)
103 *str++ = c;
104 while (i < precision--)
105 *str++ = '0';
106 while (i-- > 0)
107 *str++ = tmp[i];
108 while (size-- > 0)
109 *str++ = ' ';
110 return str;
111}
112
113int vsprintf(char *buf, const char *fmt, va_list args)
114{
115 int len;
116 unsigned long num;
117 int i, base;
118 char *str;
119 const char *s;
120
121 int flags; /* flags to number() */
122
123 int field_width; /* width of output field */
124 int precision; /* min. # of digits for integers; max
125				   number of chars from string */
126 int qualifier; /* 'h', 'l', or 'L' for integer fields */
127
128 for (str = buf; *fmt; ++fmt) {
129 if (*fmt != '%') {
130 *str++ = *fmt;
131 continue;
132 }
133
134 /* process flags */
135 flags = 0;
136 repeat:
137 ++fmt; /* this also skips first '%' */
138 switch (*fmt) {
139 case '-':
140 flags |= LEFT;
141 goto repeat;
142 case '+':
143 flags |= PLUS;
144 goto repeat;
145 case ' ':
146 flags |= SPACE;
147 goto repeat;
148 case '#':
149 flags |= SPECIAL;
150 goto repeat;
151 case '0':
152 flags |= ZEROPAD;
153 goto repeat;
154 }
155
156 /* get field width */
157 field_width = -1;
158 if (isdigit(*fmt))
159 field_width = skip_atoi(&fmt);
160 else if (*fmt == '*') {
161 ++fmt;
162 /* it's the next argument */
163 field_width = va_arg(args, int);
164 if (field_width < 0) {
165 field_width = -field_width;
166 flags |= LEFT;
167 }
168 }
169
170 /* get the precision */
171 precision = -1;
172 if (*fmt == '.') {
173 ++fmt;
174 if (isdigit(*fmt))
175 precision = skip_atoi(&fmt);
176 else if (*fmt == '*') {
177 ++fmt;
178 /* it's the next argument */
179 precision = va_arg(args, int);
180 }
181 if (precision < 0)
182 precision = 0;
183 }
184
185 /* get the conversion qualifier */
186 qualifier = -1;
187 if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') {
188 qualifier = *fmt;
189 ++fmt;
190 }
191
192 /* default base */
193 base = 10;
194
195 switch (*fmt) {
196 case 'c':
197 if (!(flags & LEFT))
198 while (--field_width > 0)
199 *str++ = ' ';
200 *str++ = (unsigned char)va_arg(args, int);
201 while (--field_width > 0)
202 *str++ = ' ';
203 continue;
204
205 case 's':
206 s = va_arg(args, char *);
207 len = strnlen(s, precision);
208
209 if (!(flags & LEFT))
210 while (len < field_width--)
211 *str++ = ' ';
212 for (i = 0; i < len; ++i)
213 *str++ = *s++;
214 while (len < field_width--)
215 *str++ = ' ';
216 continue;
217
218 case 'p':
219 if (field_width == -1) {
220 field_width = 2 * sizeof(void *);
221 flags |= ZEROPAD;
222 }
223 str = number(str,
224 (unsigned long)va_arg(args, void *), 16,
225 field_width, precision, flags);
226 continue;
227
228 case 'n':
229 if (qualifier == 'l') {
230 long *ip = va_arg(args, long *);
231 *ip = (str - buf);
232 } else {
233 int *ip = va_arg(args, int *);
234 *ip = (str - buf);
235 }
236 continue;
237
238 case '%':
239 *str++ = '%';
240 continue;
241
242 /* integer number formats - set up the flags and "break" */
243 case 'o':
244 base = 8;
245 break;
246
247 case 'X':
248 flags |= LARGE;
249 case 'x':
250 base = 16;
251 break;
252
253 case 'd':
254 case 'i':
255 flags |= SIGN;
256 case 'u':
257 break;
258
259 default:
260 *str++ = '%';
261 if (*fmt)
262 *str++ = *fmt;
263 else
264 --fmt;
265 continue;
266 }
267 if (qualifier == 'l')
268 num = va_arg(args, unsigned long);
269 else if (qualifier == 'h') {
270 num = (unsigned short)va_arg(args, int);
271 if (flags & SIGN)
272 num = (short)num;
273 } else if (flags & SIGN)
274 num = va_arg(args, int);
275 else
276 num = va_arg(args, unsigned int);
277 str = number(str, num, base, field_width, precision, flags);
278 }
279 *str = '\0';
280 return str - buf;
281}
282
283int sprintf(char *buf, const char *fmt, ...)
284{
285 va_list args;
286 int i;
287
288 va_start(args, fmt);
289 i = vsprintf(buf, fmt, args);
290 va_end(args);
291 return i;
292}
293
294int printf(const char *fmt, ...)
295{
296 char printf_buf[1024];
297 va_list args;
298 int printed;
299
300 va_start(args, fmt);
301 printed = vsprintf(printf_buf, fmt, args);
302 va_end(args);
303
304 puts(printf_buf);
305
306 return printed;
307}
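do_div() in printf.c is a GNU statement expression that leaves the quotient in n and yields the remainder, which is why number() collects digits least-significant first and then emits them in reverse. A minimal user-space illustration of that digit loop (gcc statement expressions assumed; not part of the boot code):

    #include <stdio.h>

    #define do_div(n,base) ({ \
            int __res; \
            __res = ((unsigned long) n) % (unsigned) base; \
            n = ((unsigned long) n) / (unsigned) base; \
            __res; })

    int main(void)
    {
            unsigned long n = 0x1a5;   /* 421 decimal */
            char tmp[16];
            int i = 0;

            while (n)
                    tmp[i++] = "0123456789abcdef"[do_div(n, 16)];
            /* digits arrive least-significant first: "5a1" */
            while (i--)
                    putchar(tmp[i]);   /* prints "1a5" */
            putchar('\n');
            return 0;
    }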
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
new file mode 100644
index 000000000000..df9234b3a5e0
--- /dev/null
+++ b/arch/x86/boot/setup.ld
@@ -0,0 +1,54 @@
1/*
2 * setup.ld
3 *
4 * Linker script for the i386 setup code
5 */
6OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
7OUTPUT_ARCH(i386)
8ENTRY(_start)
9
10SECTIONS
11{
12 . = 0;
13 .bstext : { *(.bstext) }
14 .bsdata : { *(.bsdata) }
15
16 . = 497;
17 .header : { *(.header) }
18 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) }
20 .text : { *(.text*) }
21
22 . = ALIGN(16);
23 .rodata : { *(.rodata*) }
24
25 .videocards : {
26 video_cards = .;
27 *(.videocards)
28 video_cards_end = .;
29 }
30
31 . = ALIGN(16);
32 .data : { *(.data*) }
33
34 .signature : {
35 setup_sig = .;
36 LONG(0x5a5aaa55)
37 }
38
39
40 . = ALIGN(16);
41 .bss :
42 {
43 __bss_start = .;
44 *(.bss)
45 __bss_end = .;
46 }
47 . = ALIGN(16);
48 _end = .;
49
50 /DISCARD/ : { *(.note*) }
51
52 . = ASSERT(_end <= 0x8000, "Setup too big!");
53 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
54}
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
new file mode 100644
index 000000000000..481a22097781
--- /dev/null
+++ b/arch/x86/boot/string.c
@@ -0,0 +1,52 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/string.c
13 *
14 * Very basic string functions
15 */
16
17#include "boot.h"
18
19int strcmp(const char *str1, const char *str2)
20{
21 const unsigned char *s1 = (const unsigned char *)str1;
22 const unsigned char *s2 = (const unsigned char *)str2;
23 int delta = 0;
24
25 while (*s1 || *s2) {
26 delta = *s2 - *s1;
27 if (delta)
28 return delta;
29 s1++;
30 s2++;
31 }
32 return 0;
33}
34
35size_t strnlen(const char *s, size_t maxlen)
36{
37 const char *es = s;
38 while (*es && maxlen) {
39 es++;
40 maxlen--;
41 }
42
43 return (es - s);
44}
45
46unsigned int atou(const char *s)
47{
48 unsigned int i = 0;
49 while (isdigit(*s))
50 i = i * 10 + (*s++ - '0');
51 return i;
52}
diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore
new file mode 100644
index 000000000000..378eac25d311
--- /dev/null
+++ b/arch/x86/boot/tools/.gitignore
@@ -0,0 +1 @@
build
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
new file mode 100644
index 000000000000..b4248740ff0d
--- /dev/null
+++ b/arch/x86/boot/tools/build.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1997 Martin Mares
4 * Copyright (C) 2007 H. Peter Anvin
5 */
6
7/*
8 * This file builds a disk-image from two different files:
9 *
10 * - setup: 8086 machine code, sets up system parameters
11 * - system: 80386 code for actual system
12 *
13 * It does some checking that all files are of the correct type, and
14 * just writes the result to stdout, removing headers and padding to
15 * the right amount. It also writes some system data to stderr.
16 */
17
18/*
19 * Changes by tytso to allow root device specification
20 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
21 * Cross compiling fixes by Gertjan van Wingerde, July 1996
22 * Rewritten by Martin Mares, April 1997
23 * Substantially overhauled by H. Peter Anvin, April 2007
24 */
25
26#include <stdio.h>
27#include <string.h>
28#include <stdlib.h>
29#include <stdarg.h>
30#include <sys/types.h>
31#include <sys/stat.h>
32#include <sys/sysmacros.h>
33#include <unistd.h>
34#include <fcntl.h>
35#include <sys/mman.h>
36#include <asm/boot.h>
37
38typedef unsigned char u8;
39typedef unsigned short u16;
40typedef unsigned long u32;
41
42#define DEFAULT_MAJOR_ROOT 0
43#define DEFAULT_MINOR_ROOT 0
44
45/* Minimal number of setup sectors */
46#define SETUP_SECT_MIN 5
47#define SETUP_SECT_MAX 64
48
49/* This must be large enough to hold the entire setup */
50u8 buf[SETUP_SECT_MAX*512];
51int is_big_kernel;
52
53static void die(const char * str, ...)
54{
55 va_list args;
56 va_start(args, str);
57 vfprintf(stderr, str, args);
58 fputc('\n', stderr);
59 exit(1);
60}
61
62static void usage(void)
63{
64 die("Usage: build [-b] setup system [rootdev] [> image]");
65}
66
67int main(int argc, char ** argv)
68{
69 unsigned int i, sz, setup_sectors;
70 int c;
71 u32 sys_size;
72 u8 major_root, minor_root;
73 struct stat sb;
74 FILE *file;
75 int fd;
76 void *kernel;
77
78 if (argc > 2 && !strcmp(argv[1], "-b"))
79 {
80 is_big_kernel = 1;
81 argc--, argv++;
82 }
83 if ((argc < 3) || (argc > 4))
84 usage();
85 if (argc > 3) {
86 if (!strcmp(argv[3], "CURRENT")) {
87 if (stat("/", &sb)) {
88 perror("/");
89 die("Couldn't stat /");
90 }
91 major_root = major(sb.st_dev);
92 minor_root = minor(sb.st_dev);
93 } else if (strcmp(argv[3], "FLOPPY")) {
94 if (stat(argv[3], &sb)) {
95 perror(argv[3]);
96 die("Couldn't stat root device.");
97 }
98 major_root = major(sb.st_rdev);
99 minor_root = minor(sb.st_rdev);
100 } else {
101 major_root = 0;
102 minor_root = 0;
103 }
104 } else {
105 major_root = DEFAULT_MAJOR_ROOT;
106 minor_root = DEFAULT_MINOR_ROOT;
107 }
108 fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
109
110 /* Copy the setup code */
111 file = fopen(argv[1], "r");
112 if (!file)
113 die("Unable to open `%s': %m", argv[1]);
114 c = fread(buf, 1, sizeof(buf), file);
115 if (ferror(file))
116 die("read-error on `setup'");
117 if (c < 1024)
118 die("The setup must be at least 1024 bytes");
119 if (buf[510] != 0x55 || buf[511] != 0xaa)
120 die("Boot block hasn't got boot flag (0xAA55)");
121 fclose(file);
122
123 /* Pad unused space with zeros */
124 setup_sectors = (c + 511) / 512;
125 if (setup_sectors < SETUP_SECT_MIN)
126 setup_sectors = SETUP_SECT_MIN;
127 i = setup_sectors*512;
128 memset(buf+c, 0, i-c);
129
130 /* Set the default root device */
131 buf[508] = minor_root;
132 buf[509] = major_root;
133
134 fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
135
136 /* Open and stat the kernel file */
137 fd = open(argv[2], O_RDONLY);
138 if (fd < 0)
139 die("Unable to open `%s': %m", argv[2]);
140 if (fstat(fd, &sb))
141 die("Unable to stat `%s': %m", argv[2]);
142 sz = sb.st_size;
143 fprintf (stderr, "System is %d kB\n", (sz+1023)/1024);
144 kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
145 if (kernel == MAP_FAILED)
146 die("Unable to mmap '%s': %m", argv[2]);
147 sys_size = (sz + 15) / 16;
148 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
149 die("System is too big. Try using bzImage or modules.");
150
151 /* Patch the setup code with the appropriate size parameters */
152 buf[0x1f1] = setup_sectors-1;
153 buf[0x1f4] = sys_size;
154 buf[0x1f5] = sys_size >> 8;
155 buf[0x1f6] = sys_size >> 16;
156 buf[0x1f7] = sys_size >> 24;
157
158 if (fwrite(buf, 1, i, stdout) != i)
159 die("Writing setup failed");
160
161 /* Copy the kernel code */
162 if (fwrite(kernel, 1, sz, stdout) != sz)
163 die("Writing kernel failed");
164 close(fd);
165
166 /* Everything is OK */
167 return 0;
168}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
new file mode 100644
index 000000000000..f3f14bd26371
--- /dev/null
+++ b/arch/x86/boot/tty.c
@@ -0,0 +1,112 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/tty.c
13 *
14 * Very simple screen I/O
15 * XXX: Probably should add very simple serial I/O?
16 */
17
18#include "boot.h"
19
20/*
21 * These functions are in .inittext so they can be used to signal
22 * error during initialization.
23 */
24
25void __attribute__((section(".inittext"))) putchar(int ch)
26{
27 unsigned char c = ch;
28
29 if (c == '\n')
30 putchar('\r'); /* \n -> \r\n */
31
32 /* int $0x10 is known to have bugs involving touching registers
33 it shouldn't. Be extra conservative... */
34 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
35 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
36}
37
38void __attribute__((section(".inittext"))) puts(const char *str)
39{
40 int n = 0;
41 while (*str) {
42 putchar(*str++);
43 n++;
44 }
45}
46
47/*
48 * Read the CMOS clock through the BIOS, and return the
49 * seconds in BCD.
50 */
51
52static u8 gettime(void)
53{
54 u16 ax = 0x0200;
55 u16 cx, dx;
56
57 asm volatile("int $0x1a"
58 : "+a" (ax), "=c" (cx), "=d" (dx)
59 : : "ebx", "esi", "edi");
60
61 return dx >> 8;
62}
63
64/*
65 * Read from the keyboard
66 */
67int getchar(void)
68{
69 u16 ax = 0;
70 asm volatile("int $0x16" : "+a" (ax));
71
72 return ax & 0xff;
73}
74
75static int kbd_pending(void)
76{
77 u8 pending;
78 asm volatile("int $0x16; setnz %0"
79 : "=rm" (pending)
80 : "a" (0x0100));
81 return pending;
82}
83
84void kbd_flush(void)
85{
86 for (;;) {
87 if (!kbd_pending())
88 break;
89 getchar();
90 }
91}
92
93int getchar_timeout(void)
94{
95 int cnt = 30;
96 int t0, t1;
97
98 t0 = gettime();
99
100 while (cnt) {
101 if (kbd_pending())
102 return getchar();
103
104 t1 = gettime();
105 if (t0 != t1) {
106 cnt--;
107 t0 = t1;
108 }
109 }
110
111 return 0; /* Timeout! */
112}
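gettime() returns the RTC seconds field as BCD (int 0x1a, ah=0x02 reports BCD), and getchar_timeout() only compares successive readings for inequality, so it never decodes the value. For reference, a hypothetical helper (not part of the boot code) that converts such a BCD byte to binary:

    #include <stdio.h>

    /* Convert a two-digit BCD byte (e.g. 0x59) to binary (59). */
    static unsigned int bcd2bin(unsigned char bcd)
    {
            return (bcd >> 4) * 10 + (bcd & 0x0f);
    }

    int main(void)
    {
            printf("%u\n", bcd2bin(0x59)); /* prints 59 */
            return 0;
    }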
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
new file mode 100644
index 000000000000..c61462f7d9a7
--- /dev/null
+++ b/arch/x86/boot/version.c
@@ -0,0 +1,23 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/version.c
13 *
14 * Kernel version string
15 */
16
17#include "boot.h"
18#include <linux/utsrelease.h>
19#include <linux/compile.h>
20
21const char kernel_version[] =
22 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
23 UTS_VERSION;
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
new file mode 100644
index 000000000000..ff5b73cd406f
--- /dev/null
+++ b/arch/x86/boot/vesa.h
@@ -0,0 +1,79 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#ifndef BOOT_VESA_H
14#define BOOT_VESA_H
15
16typedef struct {
17 u16 off, seg;
18} far_ptr;
19
20/* VESA General Information table */
21struct vesa_general_info {
22 u32 signature; /* 0 Magic number = "VESA" */
23 u16 version; /* 4 */
24 far_ptr vendor_string; /* 6 */
25 u32 capabilities; /* 10 */
26 far_ptr video_mode_ptr; /* 14 */
27 u16 total_memory; /* 18 */
28
29 u16 oem_software_rev; /* 20 */
30 far_ptr oem_vendor_name_ptr; /* 22 */
31 far_ptr oem_product_name_ptr; /* 26 */
32 far_ptr oem_product_rev_ptr; /* 30 */
33
34 u8 reserved[222]; /* 34 */
35 u8 oem_data[256]; /* 256 */
36} __attribute__ ((packed));
37
38#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
39#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
40
41struct vesa_mode_info {
42 u16 mode_attr; /* 0 */
43 u8 win_attr[2]; /* 2 */
44 u16 win_grain; /* 4 */
45 u16 win_size; /* 6 */
46 u16 win_seg[2]; /* 8 */
47 far_ptr win_scheme; /* 12 */
48 u16 logical_scan; /* 16 */
49
50 u16 h_res; /* 18 */
51 u16 v_res; /* 20 */
52 u8 char_width; /* 22 */
53 u8 char_height; /* 23 */
54 u8 memory_planes; /* 24 */
55 u8 bpp; /* 25 */
56 u8 banks; /* 26 */
57 u8 memory_layout; /* 27 */
58 u8 bank_size; /* 28 */
59 u8 image_planes; /* 29 */
60 u8 page_function; /* 30 */
61
62 u8 rmask; /* 31 */
63 u8 rpos; /* 32 */
64 u8 gmask; /* 33 */
65 u8 gpos; /* 34 */
66 u8 bmask; /* 35 */
67 u8 bpos; /* 36 */
68 u8 resv_mask; /* 37 */
69 u8 resv_pos; /* 38 */
70 u8 dcm_info; /* 39 */
71
72 u32 lfb_ptr; /* 40 Linear frame buffer address */
73 u32 offscreen_ptr; /* 44 Offscreen memory address */
74 u16 offscreen_size; /* 48 */
75
76 u8 reserved[206]; /* 50 */
77} __attribute__ ((packed));
78
79#endif /* LIB_SYS_VESA_H */
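A far_ptr is a real-mode seg:off pair; the linear address it designates is (seg << 4) + off, which is how video-vesa.c follows video_mode_ptr with set_fs()/rdfs16(). A tiny sketch of the conversion with made-up values:

    #include <stdio.h>

    typedef unsigned short u16;
    typedef unsigned int u32;

    typedef struct {
            u16 off, seg;
    } far_ptr;

    int main(void)
    {
            far_ptr p = { .off = 0x0022, .seg = 0xc000 }; /* hypothetical BIOS pointer */
            u32 linear = ((u32)p.seg << 4) + p.off;

            printf("%04x:%04x -> linear %#x\n", p.seg, p.off, linear); /* 0xc0022 */
            return 0;
    }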
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
new file mode 100644
index 000000000000..68e65d95cdfd
--- /dev/null
+++ b/arch/x86/boot/video-bios.c
@@ -0,0 +1,125 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-bios.c
13 *
14 * Standard video BIOS modes
15 *
16 * We have two options for this: silent and scanned.
17 */
18
19#include "boot.h"
20#include "video.h"
21
22__videocard video_bios;
23
24/* Set a conventional BIOS mode */
25static int set_bios_mode(u8 mode);
26
27static int bios_set_mode(struct mode_info *mi)
28{
29 return set_bios_mode(mi->mode - VIDEO_FIRST_BIOS);
30}
31
32static int set_bios_mode(u8 mode)
33{
34 u16 ax;
35 u8 new_mode;
36
37 ax = mode; /* AH=0x00 Set Video Mode */
38 asm volatile(INT10
39 : "+a" (ax)
40 : : "ebx", "ecx", "edx", "esi", "edi");
41
42 ax = 0x0f00; /* Get Current Video Mode */
43 asm volatile(INT10
44 : "+a" (ax)
45 : : "ebx", "ecx", "edx", "esi", "edi");
46
47 do_restore = 1; /* Assume video contents were lost */
48 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */
49
50 if (new_mode == mode)
51 return 0; /* Mode change OK */
52
53 if (new_mode != boot_params.screen_info.orig_video_mode) {
54 /* Mode setting failed, but we didn't end up where we
55 started. That's bad. Try to revert to the original
56 video mode. */
57 ax = boot_params.screen_info.orig_video_mode;
58 asm volatile(INT10
59 : "+a" (ax)
60 : : "ebx", "ecx", "edx", "esi", "edi");
61 }
62 return -1;
63}
64
65static int bios_probe(void)
66{
67 u8 mode;
68 u8 saved_mode = boot_params.screen_info.orig_video_mode;
69 u16 crtc;
70 struct mode_info *mi;
71 int nmodes = 0;
72
73 if (adapter != ADAPTER_EGA && adapter != ADAPTER_VGA)
74 return 0;
75
76 set_fs(0);
77 crtc = vga_crtc();
78
79 video_bios.modes = GET_HEAP(struct mode_info, 0);
80
81 for (mode = 0x14; mode <= 0x7f; mode++) {
82 if (heap_free() < sizeof(struct mode_info))
83 break;
84
85 if (mode_defined(VIDEO_FIRST_BIOS+mode))
86 continue;
87
88 if (set_bios_mode(mode))
89 continue;
90
91 /* Try to verify that it's a text mode. */
92
93		/* Attribute Controller: graphics mode must be disabled */
94 if (in_idx(0x3c0, 0x10) & 0x01)
95 continue;
96
97 /* Graphics Controller: verify Alpha addressing enabled */
98 if (in_idx(0x3ce, 0x06) & 0x01)
99 continue;
100
101 /* CRTC cursor location low should be zero(?) */
102 if (in_idx(crtc, 0x0f))
103 continue;
104
105 mi = GET_HEAP(struct mode_info, 1);
106 mi->mode = VIDEO_FIRST_BIOS+mode;
107 mi->x = rdfs16(0x44a);
108 mi->y = rdfs8(0x484)+1;
109 nmodes++;
110 }
111
112 set_bios_mode(saved_mode);
113
114 return nmodes;
115}
116
117__videocard video_bios =
118{
119 .card_name = "BIOS (scanned)",
120 .probe = bios_probe,
121 .set_mode = bios_set_mode,
122 .unsafe = 1,
123 .xmode_first = VIDEO_FIRST_BIOS,
124 .xmode_n = 0x80,
125};
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
new file mode 100644
index 000000000000..192190710710
--- /dev/null
+++ b/arch/x86/boot/video-vesa.c
@@ -0,0 +1,292 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-vesa.c
13 *
14 * VESA text modes
15 */
16
17#include "boot.h"
18#include "video.h"
19#include "vesa.h"
20
21/* VESA information */
22static struct vesa_general_info vginfo;
23static struct vesa_mode_info vminfo;
24
25__videocard video_vesa;
26
27static void vesa_store_mode_params_graphics(void);
28
29static int vesa_probe(void)
30{
31#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
32 u16 ax, cx, di;
33 u16 mode;
34 addr_t mode_ptr;
35 struct mode_info *mi;
36 int nmodes = 0;
37
38 video_vesa.modes = GET_HEAP(struct mode_info, 0);
39
40 vginfo.signature = VBE2_MAGIC;
41
42 ax = 0x4f00;
43 di = (size_t)&vginfo;
44 asm(INT10
45 : "+a" (ax), "+D" (di), "=m" (vginfo)
46 : : "ebx", "ecx", "edx", "esi");
47
48 if (ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102)
51 return 0; /* Not present */
52#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */
53#ifdef CONFIG_VIDEO_VESA
54 set_fs(vginfo.video_mode_ptr.seg);
55 mode_ptr = vginfo.video_mode_ptr.off;
56
57 while ((mode = rdfs16(mode_ptr)) != 0xffff) {
58 mode_ptr += 2;
59
60 if (heap_free() < sizeof(struct mode_info))
61 break; /* Heap full, can't save mode info */
62
63 if (mode & ~0x1ff)
64 continue;
65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67
68 ax = 0x4f01;
69 cx = mode;
70 di = (size_t)&vminfo;
71 asm(INT10
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74
75 if (ax != 0x004f)
76 continue;
77
78 if ((vminfo.mode_attr & 0x15) == 0x05) {
79 /* Text Mode, TTY BIOS supported,
80 supported by hardware */
81 mi = GET_HEAP(struct mode_info, 1);
82 mi->mode = mode + VIDEO_FIRST_VESA;
83 mi->x = vminfo.h_res;
84 mi->y = vminfo.v_res;
85 nmodes++;
86 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
87#ifdef CONFIG_FB
88 /* Graphics mode, color, linear frame buffer
89 supported -- register the mode but hide from
90 the menu. Only do this if framebuffer is
91 configured, however, otherwise the user will
92 be left without a screen. */
93 mi = GET_HEAP(struct mode_info, 1);
94 mi->mode = mode + VIDEO_FIRST_VESA;
95 mi->x = mi->y = 0;
96 nmodes++;
97#endif
98 }
99 }
100
101 return nmodes;
102#else
103 return 0;
104#endif /* CONFIG_VIDEO_VESA */
105}
106
107static int vesa_set_mode(struct mode_info *mode)
108{
109 u16 ax, bx, cx, di;
110 int is_graphic;
111 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
112
113 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
114
115 ax = 0x4f01;
116 cx = vesa_mode;
117 di = (size_t)&vminfo;
118 asm(INT10
119 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
120 : : "ebx", "edx", "esi");
121
122 if (ax != 0x004f)
123 return -1;
124
125 if ((vminfo.mode_attr & 0x15) == 0x05) {
126 /* It's a supported text mode */
127 is_graphic = 0;
128 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
129 /* It's a graphics mode with linear frame buffer */
130 is_graphic = 1;
131 vesa_mode |= 0x4000; /* Request linear frame buffer */
132 } else {
133 return -1; /* Invalid mode */
134 }
135
136
137 ax = 0x4f02;
138 bx = vesa_mode;
139 di = 0;
140 asm volatile(INT10
141 : "+a" (ax), "+b" (bx), "+D" (di)
142 : : "ecx", "edx", "esi");
143
144 if (ax != 0x004f)
145 return -1;
146
147 graphic_mode = is_graphic;
148 if (!is_graphic) {
149 /* Text mode */
150 force_x = mode->x;
151 force_y = mode->y;
152 do_restore = 1;
153 } else {
154 /* Graphics mode */
155 vesa_store_mode_params_graphics();
156 }
157
158 return 0;
159}
160
161
162/* Switch DAC to 8-bit mode */
163static void vesa_dac_set_8bits(void)
164{
165 u8 dac_size = 6;
166
167 /* If possible, switch the DAC to 8-bit mode */
168 if (vginfo.capabilities & 1) {
169 u16 ax, bx;
170
171 ax = 0x4f08;
172 bx = 0x0800;
173 asm volatile(INT10
174 : "+a" (ax), "+b" (bx)
175 : : "ecx", "edx", "esi", "edi");
176
177 if (ax == 0x004f)
178 dac_size = bx >> 8;
179 }
180
181 /* Set the color sizes to the DAC size, and offsets to 0 */
182 boot_params.screen_info.red_size = dac_size;
183 boot_params.screen_info.green_size = dac_size;
184 boot_params.screen_info.blue_size = dac_size;
185 boot_params.screen_info.rsvd_size = dac_size;
186
187 boot_params.screen_info.red_pos = 0;
188 boot_params.screen_info.green_pos = 0;
189 boot_params.screen_info.blue_pos = 0;
190 boot_params.screen_info.rsvd_pos = 0;
191}
192
193/* Save the VESA protected mode info */
194static void vesa_store_pm_info(void)
195{
196 u16 ax, bx, di, es;
197
198 ax = 0x4f0a;
199 bx = di = 0;
200 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
201 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
202 : : "ecx", "esi");
203
204 if (ax != 0x004f)
205 return;
206
207 boot_params.screen_info.vesapm_seg = es;
208 boot_params.screen_info.vesapm_off = di;
209}
210
211/*
212 * Save video mode parameters for graphics mode
213 */
214static void vesa_store_mode_params_graphics(void)
215{
216 /* Tell the kernel we're in VESA graphics mode */
217 boot_params.screen_info.orig_video_isVGA = 0x23;
218
219 /* Mode parameters */
220 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
221 boot_params.screen_info.lfb_linelength = vminfo.logical_scan;
222 boot_params.screen_info.lfb_width = vminfo.h_res;
223 boot_params.screen_info.lfb_height = vminfo.v_res;
224 boot_params.screen_info.lfb_depth = vminfo.bpp;
225 boot_params.screen_info.pages = vminfo.image_planes;
226 boot_params.screen_info.lfb_base = vminfo.lfb_ptr;
227 memcpy(&boot_params.screen_info.red_size,
228 &vminfo.rmask, 8);
229
230 /* General parameters */
231 boot_params.screen_info.lfb_size = vginfo.total_memory;
232
233 if (vminfo.bpp <= 8)
234 vesa_dac_set_8bits();
235
236 vesa_store_pm_info();
237}
238
239/*
240 * Save EDID information for the kernel; this is invoked, separately,
241 * after mode-setting.
242 */
243void vesa_store_edid(void)
244{
245#ifdef CONFIG_FIRMWARE_EDID
246 u16 ax, bx, cx, dx, di;
247
248 /* Apparently used as a nonsense token... */
249 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
250
251 if (vginfo.version < 0x0200)
252 return; /* EDID requires VBE 2.0+ */
253
254 ax = 0x4f15; /* VBE DDC */
255 bx = 0x0000; /* Report DDC capabilities */
256 cx = 0; /* Controller 0 */
257 di = 0; /* ES:DI must be 0 by spec */
258
259 /* Note: The VBE DDC spec is different from the main VESA spec;
260 we genuinely have to assume all registers are destroyed here. */
261
262 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
263 : "+a" (ax), "+b" (bx)
264 : "c" (cx), "D" (di)
265 : "esi");
266
267 if (ax != 0x004f)
268 return; /* No EDID */
269
270 /* BH = time in seconds to transfer EDID information */
271 /* BL = DDC level supported */
272
273 ax = 0x4f15; /* VBE DDC */
274 bx = 0x0001; /* Read EDID */
275 cx = 0; /* Controller 0 */
276 dx = 0; /* EDID block number */
277 di = (size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
278 asm(INT10
279 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info)
280 : "c" (cx), "D" (di)
281 : "esi");
282#endif /* CONFIG_FIRMWARE_EDID */
283}
284
285__videocard video_vesa =
286{
287 .card_name = "VESA",
288 .probe = vesa_probe,
289 .set_mode = vesa_set_mode,
290 .xmode_first = VIDEO_FIRST_VESA,
291 .xmode_n = 0x200,
292};
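Note on the two magic tests on vminfo.mode_attr in vesa_set_mode() above: they follow the VBE mode-attribute bit layout (roughly, bit 0 = mode supported, bit 2 = BIOS TTY output usable, bit 3 = color, bit 4 = graphics rather than text, bit 7 = linear frame buffer available). A minimal sketch with the masks spelled out, assuming those bit positions; the VBE_ATTR_* names and helpers are illustrative only, not part of the file:

#define VBE_ATTR_SUPPORTED	(1 << 0)	/* mode supported by hardware */
#define VBE_ATTR_BIOS_TTY	(1 << 2)	/* BIOS text output works here */
#define VBE_ATTR_COLOR		(1 << 3)	/* color mode */
#define VBE_ATTR_GRAPHICS	(1 << 4)	/* graphics, not text */
#define VBE_ATTR_LFB		(1 << 7)	/* linear frame buffer available */

/* (mode_attr & 0x15) == 0x05: supported text mode the BIOS can print to */
static inline int vesa_attr_is_text(u16 attr)
{
	return (attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_BIOS_TTY | VBE_ATTR_GRAPHICS))
		== (VBE_ATTR_SUPPORTED | VBE_ATTR_BIOS_TTY);
}

/* (mode_attr & 0x99) == 0x99: supported color graphics mode with an LFB */
static inline int vesa_attr_is_lfb_graphics(u16 attr)
{
	return (attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
			VBE_ATTR_GRAPHICS | VBE_ATTR_LFB))
		== (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
		    VBE_ATTR_GRAPHICS | VBE_ATTR_LFB);
}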
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
new file mode 100644
index 000000000000..aef02f9ec0c1
--- /dev/null
+++ b/arch/x86/boot/video-vga.c
@@ -0,0 +1,261 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video-vga.c
13 *
14 * Common all-VGA modes
15 */
16
17#include "boot.h"
18#include "video.h"
19
20static struct mode_info vga_modes[] = {
21 { VIDEO_80x25, 80, 25 },
22 { VIDEO_8POINT, 80, 50 },
23 { VIDEO_80x43, 80, 43 },
24 { VIDEO_80x28, 80, 28 },
25 { VIDEO_80x30, 80, 30 },
26 { VIDEO_80x34, 80, 34 },
27 { VIDEO_80x60, 80, 60 },
28};
29
30static struct mode_info ega_modes[] = {
31 { VIDEO_80x25, 80, 25 },
32 { VIDEO_8POINT, 80, 43 },
33};
34
35static struct mode_info cga_modes[] = {
36 { VIDEO_80x25, 80, 25 },
37};
38
39__videocard video_vga;
40
41/* Set basic 80x25 mode */
42static u8 vga_set_basic_mode(void)
43{
44 u16 ax;
45 u8 rows;
46 u8 mode;
47
48#ifdef CONFIG_VIDEO_400_HACK
49 if (adapter >= ADAPTER_VGA) {
50 asm volatile(INT10
51 : : "a" (0x1202), "b" (0x0030)
52 : "ecx", "edx", "esi", "edi");
53 }
54#endif
55
56 ax = 0x0f00;
57 asm volatile(INT10
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60
61 mode = (u8)ax;
62
63 set_fs(0);
64 rows = rdfs8(0x484); /* rows minus one */
65
66#ifndef CONFIG_VIDEO_400_HACK
67 if ((ax == 0x5003 || ax == 0x5007) &&
68 (rows == 0 || rows == 24))
69 return mode;
70#endif
71
72 if (mode != 3 && mode != 7)
73 mode = 3;
74
75 /* Set the mode */
76 ax = mode;
77 asm volatile(INT10
78 : "+a" (ax)
79 : : "ebx", "ecx", "edx", "esi", "edi");
80 do_restore = 1;
81 return mode;
82}
83
84static void vga_set_8font(void)
85{
86 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
87
88 /* Set 8x8 font */
89 asm volatile(INT10 : : "a" (0x1112), "b" (0));
90
91 /* Use alternate print screen */
92 asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
93
94 /* Turn off cursor emulation */
95 asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
96
97 /* Cursor is scan lines 6-7 */
98 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
99}
100
101static void vga_set_14font(void)
102{
103 /* Set 9x14 font - 80x28 on VGA */
104
105 /* Set 9x14 font */
106 asm volatile(INT10 : : "a" (0x1111), "b" (0));
107
108 /* Turn off cursor emulation */
109 asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
110
111 /* Cursor is scan lines 11-12 */
112 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
113}
114
115static void vga_set_80x43(void)
116{
117 /* Set 80x43 mode on VGA (not EGA) */
118
119 /* Set 350 scans */
120 asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
121
122 /* Reset video mode */
123 asm volatile(INT10 : : "a" (0x0003));
124
125 vga_set_8font();
126}
127
128/* I/O address of the VGA CRTC */
129u16 vga_crtc(void)
130{
131 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
132}
133
134static void vga_set_480_scanlines(int end)
135{
136 u16 crtc;
137 u8 csel;
138
139 crtc = vga_crtc();
140
141 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
142 out_idx(0x0b, crtc, 0x06); /* Vertical total */
143 out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
144 out_idx(0xea, crtc, 0x10); /* Vertical sync start */
145 out_idx(end, crtc, 0x12); /* Vertical display end */
146 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
147 out_idx(0x04, crtc, 0x16); /* Vertical blank end */
148 csel = inb(0x3cc);
149 csel &= 0x0d;
150 csel |= 0xe2;
151 outb(csel, 0x3cc);
152}
153
154static void vga_set_80x30(void)
155{
156 vga_set_480_scanlines(0xdf);
157}
158
159static void vga_set_80x34(void)
160{
161 vga_set_14font();
162 vga_set_480_scanlines(0xdb);
163}
164
165static void vga_set_80x60(void)
166{
167 vga_set_8font();
168 vga_set_480_scanlines(0xdf);
169}
170
171static int vga_set_mode(struct mode_info *mode)
172{
173 /* Set the basic mode */
174 vga_set_basic_mode();
175
176 /* Override a possibly broken BIOS */
177 force_x = mode->x;
178 force_y = mode->y;
179
180 switch (mode->mode) {
181 case VIDEO_80x25:
182 break;
183 case VIDEO_8POINT:
184 vga_set_8font();
185 break;
186 case VIDEO_80x43:
187 vga_set_80x43();
188 break;
189 case VIDEO_80x28:
190 vga_set_14font();
191 break;
192 case VIDEO_80x30:
193 vga_set_80x30();
194 break;
195 case VIDEO_80x34:
196 vga_set_80x34();
197 break;
198 case VIDEO_80x60:
199 vga_set_80x60();
200 break;
201 }
202
203 return 0;
204}
205
206/*
207 * Note: this probe includes basic information required by all
208 * systems. It should be executed first, by making sure
209 * video-vga.c is listed first in the Makefile.
210 */
211static int vga_probe(void)
212{
213 static const char *card_name[] = {
214 "CGA/MDA/HGC", "EGA", "VGA"
215 };
216 static struct mode_info *mode_lists[] = {
217 cga_modes,
218 ega_modes,
219 vga_modes,
220 };
221 static int mode_count[] = {
222 sizeof(cga_modes)/sizeof(struct mode_info),
223 sizeof(ega_modes)/sizeof(struct mode_info),
224 sizeof(vga_modes)/sizeof(struct mode_info),
225 };
226 u8 vga_flag;
227
228 asm(INT10
229 : "=b" (boot_params.screen_info.orig_video_ega_bx)
230 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
231 : "ecx", "edx", "esi", "edi");
232
233 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
234 if ((u8)boot_params.screen_info.orig_video_ega_bx != 0x10) {
235 /* EGA/VGA */
236 asm(INT10
237 : "=a" (vga_flag)
238 : "a" (0x1a00)
239 : "ebx", "ecx", "edx", "esi", "edi");
240
241 if (vga_flag == 0x1a) {
242 adapter = ADAPTER_VGA;
243 boot_params.screen_info.orig_video_isVGA = 1;
244 } else {
245 adapter = ADAPTER_EGA;
246 }
247 } else {
248 adapter = ADAPTER_CGA;
249 }
250
251 video_vga.modes = mode_lists[adapter];
252 video_vga.card_name = card_name[adapter];
253 return mode_count[adapter];
254}
255
256__videocard video_vga =
257{
258 .card_name = "VGA",
259 .probe = vga_probe,
260 .set_mode = vga_set_mode,
261};
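Note on the values fed to vga_set_480_scanlines() above: the argument is simply the low byte of the vertical-display-end register, i.e. visible scan lines minus one, with the upper bits handled by the overflow register the function also programs. A worked check, assuming the usual font heights (16 for the default text font, 14 for the 9x14 font, 8 for the 8x8 font); the helper below is illustrative only:

/*
 *  80x30: 30 * 16 - 1 = 479 = 0x1df -> low byte 0xdf
 *  80x34: 34 * 14 - 1 = 475 = 0x1db -> low byte 0xdb
 *  80x60: 60 *  8 - 1 = 479 = 0x1df -> low byte 0xdf
 */
static inline u8 vdisplay_end_low(int rows, int font_height)
{
	return (u8)(rows * font_height - 1);	/* written to CRTC index 0x12 */
}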
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
new file mode 100644
index 000000000000..e4ba897bf9a3
--- /dev/null
+++ b/arch/x86/boot/video.c
@@ -0,0 +1,467 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/i386/boot/video.c
13 *
14 * Select video mode
15 */
16
17#include "boot.h"
18#include "video.h"
19#include "vesa.h"
20
21/*
22 * Mode list variables
23 */
24static struct card_info cards[]; /* List of cards to probe for */
25
26/*
27 * Common variables
28 */
29int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
30u16 video_segment;
31int force_x, force_y; /* Don't query the BIOS for cols/rows */
32
33int do_restore = 0; /* Screen contents changed during mode flip */
34int graphic_mode; /* Graphics mode with linear frame buffer */
35
36static void store_cursor_position(void)
37{
38 u16 curpos;
39 u16 ax, bx;
40
41 ax = 0x0300;
42 bx = 0;
43 asm(INT10
44 : "=d" (curpos), "+a" (ax), "+b" (bx)
45 : : "ecx", "esi", "edi");
46
47 boot_params.screen_info.orig_x = curpos;
48 boot_params.screen_info.orig_y = curpos >> 8;
49}
50
51static void store_video_mode(void)
52{
53 u16 ax, page;
54
55 /* N.B.: the saving of the video page here is a bit silly,
56 since we pretty much assume page 0 everywhere. */
57 ax = 0x0f00;
58 asm(INT10
59 : "+a" (ax), "=b" (page)
60 : : "ecx", "edx", "esi", "edi");
61
62 /* Not all BIOSes are clean with respect to the top bit */
63 boot_params.screen_info.orig_video_mode = ax & 0x7f;
64 boot_params.screen_info.orig_video_page = page >> 8;
65}
66
67/*
68 * Store the video mode parameters for later usage by the kernel.
69 * This is done by asking the BIOS except for the rows/columns
70 * parameters in the default 80x25 mode -- these are set directly,
71 * because some very obscure BIOSes supply insane values.
72 */
73static void store_mode_params(void)
74{
75 u16 font_size;
76 int x, y;
77
78 /* For graphics mode, it is up to the mode-setting driver
79 (currently only video-vesa.c) to store the parameters */
80 if (graphic_mode)
81 return;
82
83 store_cursor_position();
84 store_video_mode();
85
86 if (boot_params.screen_info.orig_video_mode == 0x07) {
87 /* MDA, HGC, or VGA in monochrome mode */
88 video_segment = 0xb000;
89 } else {
90 /* CGA, EGA, VGA and so forth */
91 video_segment = 0xb800;
92 }
93
94 set_fs(0);
95 font_size = rdfs16(0x485); /* Font size, BIOS area */
96 boot_params.screen_info.orig_video_points = font_size;
97
98 x = rdfs16(0x44a);
99 y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;
100
101 if (force_x)
102 x = force_x;
103 if (force_y)
104 y = force_y;
105
106 boot_params.screen_info.orig_video_cols = x;
107 boot_params.screen_info.orig_video_lines = y;
108}
109
110/* Probe the video drivers and have them generate their mode lists. */
111static void probe_cards(int unsafe)
112{
113 struct card_info *card;
114 static u8 probed[2];
115
116 if (probed[unsafe])
117 return;
118
119 probed[unsafe] = 1;
120
121 for (card = video_cards; card < video_cards_end; card++) {
122 if (card->unsafe == unsafe) {
123 if (card->probe)
124 card->nmodes = card->probe();
125 else
126 card->nmodes = 0;
127 }
128 }
129}
130
131/* Test if a mode is defined */
132int mode_defined(u16 mode)
133{
134 struct card_info *card;
135 struct mode_info *mi;
136 int i;
137
138 for (card = video_cards; card < video_cards_end; card++) {
139 mi = card->modes;
140 for (i = 0; i < card->nmodes; i++, mi++) {
141 if (mi->mode == mode)
142 return 1;
143 }
144 }
145
146 return 0;
147}
148
149/* Set mode (without recalc) */
150static int raw_set_mode(u16 mode, u16 *real_mode)
151{
152 int nmode, i;
153 struct card_info *card;
154 struct mode_info *mi;
155
156 /* Drop the recalc bit if set */
157 mode &= ~VIDEO_RECALC;
158
159 /* Scan for mode based on fixed ID, position, or resolution */
160 nmode = 0;
161 for (card = video_cards; card < video_cards_end; card++) {
162 mi = card->modes;
163 for (i = 0; i < card->nmodes; i++, mi++) {
164 int visible = mi->x || mi->y;
165
166 if ((mode == nmode && visible) ||
167 mode == mi->mode ||
168 mode == (mi->y << 8)+mi->x) {
169 *real_mode = mi->mode;
170 return card->set_mode(mi);
171 }
172
173 if (visible)
174 nmode++;
175 }
176 }
177
178 /* Nothing found? Is it an "exceptional" (unprobed) mode? */
179 for (card = video_cards; card < video_cards_end; card++) {
180 if (mode >= card->xmode_first &&
181 mode < card->xmode_first+card->xmode_n) {
182 struct mode_info mix;
183 *real_mode = mix.mode = mode;
184 mix.x = mix.y = 0;
185 return card->set_mode(&mix);
186 }
187 }
188
189 /* Otherwise, failure... */
190 return -1;
191}
192
193/*
194 * Recalculate the vertical video cutoff (hack!)
195 */
196static void vga_recalc_vertical(void)
197{
198 unsigned int font_size, rows;
199 u16 crtc;
200 u8 pt, ov;
201
202 set_fs(0);
203 font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
204 rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
205
206 rows *= font_size; /* Visible scan lines */
207 rows--; /* ... minus one */
208
209 crtc = vga_crtc();
210
211 pt = in_idx(crtc, 0x11);
212 pt &= ~0x80; /* Unlock CR0-7 */
213 out_idx(pt, crtc, 0x11);
214
215 out_idx((u8)rows, crtc, 0x12); /* Lower height register */
216
217 ov = in_idx(crtc, 0x07); /* Overflow register */
218 ov &= 0xbd;
219 ov |= (rows >> (8-1)) & 0x02;
220 ov |= (rows >> (9-6)) & 0x40;
221 out_idx(ov, crtc, 0x07);
222}
223
224/* Set mode (with recalc if specified) */
225static int set_mode(u16 mode)
226{
227 int rv;
228 u16 real_mode;
229
230 /* Very special mode numbers... */
231 if (mode == VIDEO_CURRENT_MODE)
232 return 0; /* Nothing to do... */
233 else if (mode == NORMAL_VGA)
234 mode = VIDEO_80x25;
235 else if (mode == EXTENDED_VGA)
236 mode = VIDEO_8POINT;
237
238 rv = raw_set_mode(mode, &real_mode);
239 if (rv)
240 return rv;
241
242 if (mode & VIDEO_RECALC)
243 vga_recalc_vertical();
244
245 /* Save the canonical mode number for the kernel, not
246 an alias, size specification or menu position */
247 boot_params.hdr.vid_mode = real_mode;
248 return 0;
249}
250
251static unsigned int get_entry(void)
252{
253 char entry_buf[4];
254 int i, len = 0;
255 int key;
256 unsigned int v;
257
258 do {
259 key = getchar();
260
261 if (key == '\b') {
262 if (len > 0) {
263 puts("\b \b");
264 len--;
265 }
266 } else if ((key >= '0' && key <= '9') ||
267 (key >= 'A' && key <= 'Z') ||
268 (key >= 'a' && key <= 'z')) {
269 if (len < sizeof entry_buf) {
270 entry_buf[len++] = key;
271 putchar(key);
272 }
273 }
274 } while (key != '\r');
275 putchar('\n');
276
277 if (len == 0)
278 return VIDEO_CURRENT_MODE; /* Default */
279
280 v = 0;
281 for (i = 0; i < len; i++) {
282 v <<= 4;
283 key = entry_buf[i] | 0x20;
284 v += (key > '9') ? key-'a'+10 : key-'0';
285 }
286
287 return v;
288}
289
290static void display_menu(void)
291{
292 struct card_info *card;
293 struct mode_info *mi;
294 char ch;
295 int i;
296
297 puts("Mode: COLSxROWS:\n");
298
299 ch = '0';
300 for (card = video_cards; card < video_cards_end; card++) {
301 mi = card->modes;
302 for (i = 0; i < card->nmodes; i++, mi++) {
303 int visible = mi->x && mi->y;
304 u16 mode_id = mi->mode ? mi->mode :
305 (mi->y << 8)+mi->x;
306
307 if (!visible)
308 continue; /* Hidden mode */
309
310 printf("%c %04X %3dx%-3d %s\n",
311 ch, mode_id, mi->x, mi->y, card->card_name);
312
313 if (ch == '9')
314 ch = 'a';
315 else if (ch == 'z' || ch == ' ')
316 ch = ' '; /* Out of keys... */
317 else
318 ch++;
319 }
320 }
321}
322
323#define H(x) ((x)-'a'+10)
324#define SCAN ((H('s')<<12)+(H('c')<<8)+(H('a')<<4)+H('n'))
325
326static unsigned int mode_menu(void)
327{
328 int key;
329 unsigned int sel;
330
331 puts("Press <ENTER> to see video modes available, "
332 "<SPACE> to continue, or wait 30 sec\n");
333
334 kbd_flush();
335 while (1) {
336 key = getchar_timeout();
337 if (key == ' ' || key == 0)
338 return VIDEO_CURRENT_MODE; /* Default */
339 if (key == '\r')
340 break;
341 putchar('\a'); /* Beep! */
342 }
343
344
345 for (;;) {
346 display_menu();
347
348 puts("Enter a video mode or \"scan\" to scan for "
349 "additional modes: ");
350 sel = get_entry();
351 if (sel != SCAN)
352 return sel;
353
354 probe_cards(1);
355 }
356}
357
358#ifdef CONFIG_VIDEO_RETAIN
359/* Save screen content to the heap */
360struct saved_screen {
361 int x, y;
362 int curx, cury;
363 u16 *data;
364} saved;
365
366static void save_screen(void)
367{
368 /* Should be called after store_mode_params() */
369 saved.x = boot_params.screen_info.orig_video_cols;
370 saved.y = boot_params.screen_info.orig_video_lines;
371 saved.curx = boot_params.screen_info.orig_x;
372 saved.cury = boot_params.screen_info.orig_y;
373
374 if (heap_free() < saved.x*saved.y*sizeof(u16)+512)
375 return; /* Not enough heap to save the screen */
376
377 saved.data = GET_HEAP(u16, saved.x*saved.y);
378
379 set_fs(video_segment);
380 copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16));
381}
382
383static void restore_screen(void)
384{
385 /* Should be called after store_mode_params() */
386 int xs = boot_params.screen_info.orig_video_cols;
387 int ys = boot_params.screen_info.orig_video_lines;
388 int y;
389 addr_t dst = 0;
390 u16 *src = saved.data;
391 u16 ax, bx, dx;
392
393 if (graphic_mode)
394 return; /* Can't restore onto a graphic mode */
395
396 if (!src)
397 return; /* No saved screen contents */
398
399 /* Restore screen contents */
400
401 set_fs(video_segment);
402 for (y = 0; y < ys; y++) {
403 int npad;
404
405 if (y < saved.y) {
406 int copy = (xs < saved.x) ? xs : saved.x;
407 copy_to_fs(dst, src, copy*sizeof(u16));
408 dst += copy*sizeof(u16);
409 src += saved.x;
410 npad = (xs < saved.x) ? 0 : xs-saved.x;
411 } else {
412 npad = xs;
413 }
414
415 /* Writes "npad" blank characters to
416 video_segment:dst and advances dst */
417 asm volatile("pushw %%es ; "
418 "movw %2,%%es ; "
419 "shrw %%cx ; "
420 "jnc 1f ; "
421 "stosw \n\t"
422 "1: rep;stosl ; "
423 "popw %%es"
424 : "+D" (dst), "+c" (npad)
425 : "bdS" (video_segment),
426 "a" (0x07200720));
427 }
428
429 /* Restore cursor position */
430 ax = 0x0200; /* Set cursor position */
431 bx = 0; /* Page number (<< 8) */
432 dx = (saved.cury << 8)+saved.curx;
433 asm volatile(INT10
434 : "+a" (ax), "+b" (bx), "+d" (dx)
435 : : "ecx", "esi", "edi");
436}
437#else
438#define save_screen() ((void)0)
439#define restore_screen() ((void)0)
440#endif
441
442void set_video(void)
443{
444 u16 mode = boot_params.hdr.vid_mode;
445
446 RESET_HEAP();
447
448 store_mode_params();
449 save_screen();
450 probe_cards(0);
451
452 for (;;) {
453 if (mode == ASK_VGA)
454 mode = mode_menu();
455
456 if (!set_mode(mode))
457 break;
458
459 printf("Undefined video mode number: %x\n", mode);
460 mode = ASK_VGA;
461 }
462 vesa_store_edid();
463 store_mode_params();
464
465 if (do_restore)
466 restore_screen();
467}
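Note on raw_set_mode() above: it accepts a requested mode in three forms -- a menu position (counting only visible modes), a driver mode ID such as VIDEO_80x25, or a resolution packed as (rows << 8) + cols. A small sketch of the packed form; mode_from_resolution is an illustrative helper, not part of the file:

/* e.g. 80x30 text -> (30 << 8) + 80 = 0x1e50, matched against (mi->y << 8) + mi->x */
static inline u16 mode_from_resolution(u8 cols, u8 rows)
{
	return ((u16)rows << 8) + cols;
}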
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
new file mode 100644
index 000000000000..b92447d51213
--- /dev/null
+++ b/arch/x86/boot/video.h
@@ -0,0 +1,152 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/x86/boot/video.h
13 *
14 * Header file for the real-mode video probing code
15 */
16
17#ifndef BOOT_VIDEO_H
18#define BOOT_VIDEO_H
19
20#include <linux/types.h>
21
22/* Enable autodetection of SVGA adapters and modes. */
23#undef CONFIG_VIDEO_SVGA
24
25/* Enable autodetection of VESA modes */
26#define CONFIG_VIDEO_VESA
27
28/* Retain screen contents when switching modes */
29#define CONFIG_VIDEO_RETAIN
30
31/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour) */
32#undef CONFIG_VIDEO_400_HACK
33
34/* This code uses an extended set of video mode numbers. These include:
35 * Aliases for standard modes
36 * NORMAL_VGA (-1)
37 * EXTENDED_VGA (-2)
38 * ASK_VGA (-3)
39 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
40 * of compatibility when extending the table. These are between 0x00 and 0xff.
41 */
42#define VIDEO_FIRST_MENU 0x0000
43
44/* Standard BIOS video modes (BIOS number + 0x0100) */
45#define VIDEO_FIRST_BIOS 0x0100
46
47/* VESA BIOS video modes (VESA number + 0x0200) */
48#define VIDEO_FIRST_VESA 0x0200
49
50/* Video7 special modes (BIOS number + 0x0900) */
51#define VIDEO_FIRST_V7 0x0900
52
53/* Special video modes */
54#define VIDEO_FIRST_SPECIAL 0x0f00
55#define VIDEO_80x25 0x0f00
56#define VIDEO_8POINT 0x0f01
57#define VIDEO_80x43 0x0f02
58#define VIDEO_80x28 0x0f03
59#define VIDEO_CURRENT_MODE 0x0f04
60#define VIDEO_80x30 0x0f05
61#define VIDEO_80x34 0x0f06
62#define VIDEO_80x60 0x0f07
63#define VIDEO_GFX_HACK 0x0f08
64#define VIDEO_LAST_SPECIAL 0x0f09
65
66/* Video modes given by resolution */
67#define VIDEO_FIRST_RESOLUTION 0x1000
68
69/* The "recalculate timings" flag */
70#define VIDEO_RECALC 0x8000
71
72/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
73#ifdef CONFIG_VIDEO_RETAIN
74void store_screen(void);
75#define DO_STORE() store_screen()
76#else
77#define DO_STORE() ((void)0)
78#endif /* CONFIG_VIDEO_RETAIN */
79
80/*
81 * Mode table structures
82 */
83
84struct mode_info {
85 u16 mode; /* Mode number (vga= style) */
86 u8 x, y; /* Width, height */
87};
88
89struct card_info {
90 const char *card_name;
91 int (*set_mode)(struct mode_info *mode);
92 int (*probe)(void);
93 struct mode_info *modes;
94 int nmodes; /* Number of probed modes so far */
95 int unsafe; /* Probing is unsafe, only do after "scan" */
96 u16 xmode_first; /* Unprobed modes to try to call anyway */
97 u16 xmode_n; /* Size of unprobed mode range */
98};
99
100#define __videocard struct card_info __attribute__((section(".videocards")))
101extern struct card_info video_cards[], video_cards_end[];
102
103int mode_defined(u16 mode); /* video.c */
104
105/* Basic video information */
106#define ADAPTER_CGA 0 /* CGA/MDA/HGC */
107#define ADAPTER_EGA 1
108#define ADAPTER_VGA 2
109
110extern int adapter;
111extern u16 video_segment;
112extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
113extern int do_restore; /* Restore screen contents */
114extern int graphic_mode; /* Graphics mode with linear frame buffer */
115
116/*
117 * int $0x10 is notorious for touching registers it shouldn't.
118 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
119 * sequence here.
120 *
121 * A number of systems, including the original PC, can clobber %bp in
122 * certain circumstances, like when scrolling. There exists at least
123 * one Trident video card which could clobber DS under a set of
124 * circumstances that we are unlikely to encounter (scrolling when
125 * using an extended graphics mode of more than 800x600 pixels), but
126 * it's cheap insurance to deal with that here.
127 */
128#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
129
130/* Accessing VGA indexed registers */
131static inline u8 in_idx(u16 port, u8 index)
132{
133 outb(index, port);
134 return inb(port+1);
135}
136
137static inline void out_idx(u8 v, u16 port, u8 index)
138{
139 outw(index+(v << 8), port);
140}
141
142/* Writes a value to an indexed port and then reads the port again */
143static inline u8 tst_idx(u8 v, u16 port, u8 index)
144{
145 out_idx(v, port, index);
146 return in_idx(port, index);
147}
148
149/* Get the I/O port of the VGA CRTC */
150u16 vga_crtc(void); /* video-vga.c */
151
152#endif /* BOOT_VIDEO_H */
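Note on __videocard above: each tagged struct card_info is placed in the .videocards section, and the boot linker script is expected to bracket that section with the video_cards[] and video_cards_end[] symbols that probe_cards() in video.c walks, so no central driver list is needed. A hypothetical driver registers itself along these lines; foo_probe, foo_set_mode and video_foo are illustrative names only:

static int foo_probe(void)
{
	/* detect hardware, fill in video_foo.modes, return number of modes */
	return 0;
}

static int foo_set_mode(struct mode_info *mode)
{
	return -1;	/* nothing to program in this sketch */
}

__videocard video_foo = {
	.card_name	= "FOO",
	.probe		= foo_probe,
	.set_mode	= foo_set_mode,
};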
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
new file mode 100644
index 000000000000..61c8fe0453be
--- /dev/null
+++ b/arch/x86/boot/voyager.c
@@ -0,0 +1,46 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 *
6 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * arch/x86/boot/voyager.c
13 *
14 * Get the Voyager config information
15 */
16
17#include "boot.h"
18
19#ifdef CONFIG_X86_VOYAGER
20
21int query_voyager(void)
22{
23 u8 err;
24 u16 es, di;
25 /* Abuse the apm_bios_info area for this */
26 u8 *data_ptr = (u8 *)&boot_params.apm_bios_info;
27
28 data_ptr[0] = 0xff; /* Flag on config not found(?) */
29
30 asm("pushw %%es ; "
31 "int $0x15 ; "
32 "setc %0 ; "
33 "movw %%es, %1 ; "
34 "popw %%es"
35 : "=q" (err), "=r" (es), "=D" (di)
36 : "a" (0xffc0));
37
38 if (err)
39 return -1; /* Not Voyager */
40
41 set_fs(es);
42 copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */
43 return 0;
44}
45
46#endif /* CONFIG_X86_VOYAGER */
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
new file mode 100644
index 000000000000..18dcdc6fb7aa
--- /dev/null
+++ b/arch/x86/crypto/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/crypto/Makefile_32
3else
4include ${srctree}/arch/x86/crypto/Makefile_64
5endif
diff --git a/arch/x86/crypto/Makefile_32 b/arch/x86/crypto/Makefile_32
new file mode 100644
index 000000000000..2d873a2388ed
--- /dev/null
+++ b/arch/x86/crypto/Makefile_32
@@ -0,0 +1,12 @@
1#
2# x86/crypto/Makefile
3#
4# Arch-specific CryptoAPI modules.
5#
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9
10aes-i586-y := aes-i586-asm_32.o aes_32.o
11twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
12
diff --git a/arch/x86/crypto/Makefile_64 b/arch/x86/crypto/Makefile_64
new file mode 100644
index 000000000000..b40896276e93
--- /dev/null
+++ b/arch/x86/crypto/Makefile_64
@@ -0,0 +1,12 @@
1#
2# x86/crypto/Makefile
3#
4# Arch-specific CryptoAPI modules.
5#
6
7obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
8obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
9
10aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o
11twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
12
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
new file mode 100644
index 000000000000..f942f0c8f630
--- /dev/null
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -0,0 +1,373 @@
1// -------------------------------------------------------------------------
2// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
3// All rights reserved.
4//
5// LICENSE TERMS
6//
7// The free distribution and use of this software in both source and binary
8// form is allowed (with or without changes) provided that:
9//
10// 1. distributions of this source code include the above copyright
11// notice, this list of conditions and the following disclaimer//
12//
13// 2. distributions in binary form include the above copyright
14// notice, this list of conditions and the following disclaimer
15// in the documentation and/or other associated materials//
16//
17// 3. the copyright holder's name is not used to endorse products
18// built using this software without specific written permission.
19//
20//
21// ALTERNATIVELY, provided that this notice is retained in full, this product
22// may be distributed under the terms of the GNU General Public License (GPL),
23// in which case the provisions of the GPL apply INSTEAD OF those given above.
24//
25// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
26// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
27
28// DISCLAIMER
29//
30// This software is provided 'as is' with no explicit or implied warranties
31// in respect of its properties including, but not limited to, correctness
32// and fitness for purpose.
33// -------------------------------------------------------------------------
34// Issue Date: 29/07/2002
35
36.file "aes-i586-asm.S"
37.text
38
39#include <asm/asm-offsets.h>
40
41#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
42
43/* offsets to parameters with one register pushed onto stack */
44#define tfm 8
45#define out_blk 12
46#define in_blk 16
47
48/* offsets in crypto_tfm structure */
49#define ekey (crypto_tfm_ctx_offset + 0)
50#define nrnd (crypto_tfm_ctx_offset + 256)
51#define dkey (crypto_tfm_ctx_offset + 260)
52
53// register mapping for encrypt and decrypt subroutines
54
55#define r0 eax
56#define r1 ebx
57#define r2 ecx
58#define r3 edx
59#define r4 esi
60#define r5 edi
61
62#define eaxl al
63#define eaxh ah
64#define ebxl bl
65#define ebxh bh
66#define ecxl cl
67#define ecxh ch
68#define edxl dl
69#define edxh dh
70
71#define _h(reg) reg##h
72#define h(reg) _h(reg)
73
74#define _l(reg) reg##l
75#define l(reg) _l(reg)
76
77// This macro takes a 32-bit word representing a column and uses
78// each of its four bytes to index into four tables of 256 32-bit
79// words to obtain values that are then xored into the appropriate
80// output registers r0, r1, r4 or r5.
81
82// Parameters:
83// table table base address
84// %1 out_state[0]
85// %2 out_state[1]
86// %3 out_state[2]
87// %4 out_state[3]
88// idx input register for the round (destroyed)
89// tmp scratch register for the round
90// sched key schedule
91
92#define do_col(table, a1,a2,a3,a4, idx, tmp) \
93 movzx %l(idx),%tmp; \
94 xor table(,%tmp,4),%a1; \
95 movzx %h(idx),%tmp; \
96 shr $16,%idx; \
97 xor table+tlen(,%tmp,4),%a2; \
98 movzx %l(idx),%tmp; \
99 movzx %h(idx),%idx; \
100 xor table+2*tlen(,%tmp,4),%a3; \
101 xor table+3*tlen(,%idx,4),%a4;
102
103// initialise output registers from the key schedule
104// NB1: original value of a3 is in idx on exit
105// NB2: original values of a1,a2,a4 aren't used
106#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
107 mov 0 sched,%a1; \
108 movzx %l(idx),%tmp; \
109 mov 12 sched,%a2; \
110 xor table(,%tmp,4),%a1; \
111 mov 4 sched,%a4; \
112 movzx %h(idx),%tmp; \
113 shr $16,%idx; \
114 xor table+tlen(,%tmp,4),%a2; \
115 movzx %l(idx),%tmp; \
116 movzx %h(idx),%idx; \
117 xor table+3*tlen(,%idx,4),%a4; \
118 mov %a3,%idx; \
119 mov 8 sched,%a3; \
120 xor table+2*tlen(,%tmp,4),%a3;
121
122// initialise output registers from the key schedule
123// NB1: original value of a3 is in idx on exit
124// NB2: original values of a1,a2,a4 aren't used
125#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
126 mov 0 sched,%a1; \
127 movzx %l(idx),%tmp; \
128 mov 4 sched,%a2; \
129 xor table(,%tmp,4),%a1; \
130 mov 12 sched,%a4; \
131 movzx %h(idx),%tmp; \
132 shr $16,%idx; \
133 xor table+tlen(,%tmp,4),%a2; \
134 movzx %l(idx),%tmp; \
135 movzx %h(idx),%idx; \
136 xor table+3*tlen(,%idx,4),%a4; \
137 mov %a3,%idx; \
138 mov 8 sched,%a3; \
139 xor table+2*tlen(,%tmp,4),%a3;
140
141
142// original Gladman had conditional saves to MMX regs.
143#define save(a1, a2) \
144 mov %a2,4*a1(%esp)
145
146#define restore(a1, a2) \
147 mov 4*a2(%esp),%a1
148
149// These macros perform a forward encryption cycle. They are entered with
150// the first previous round column values in r0,r1,r4,r5 and
151// exit with the final values in the same registers, using stack
152// for temporary storage.
153
154// round column values
155// on entry: r0,r1,r4,r5
156// on exit: r2,r1,r4,r5
157#define fwd_rnd1(arg, table) \
158 save (0,r1); \
159 save (1,r5); \
160 \
161 /* compute new column values */ \
162 do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
163 do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
164 restore(r0,0); \
165 do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
166 restore(r0,1); \
167 do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
168
169// round column values
170// on entry: r2,r1,r4,r5
171// on exit: r0,r1,r4,r5
172#define fwd_rnd2(arg, table) \
173 save (0,r1); \
174 save (1,r5); \
175 \
176 /* compute new column values */ \
177 do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
178 do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
179 restore(r2,0); \
180 do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
181 restore(r2,1); \
182 do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
183
184// These macros perform an inverse encryption cycle. They are entered with
185// the first previous round column values in r0,r1,r4,r5 and
186// exit with the final values in the same registers, using stack
187// for temporary storage
188
189// round column values
190// on entry: r0,r1,r4,r5
191// on exit: r2,r1,r4,r5
192#define inv_rnd1(arg, table) \
193 save (0,r1); \
194 save (1,r5); \
195 \
196 /* compute new column values */ \
197 do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
198 do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
199 restore(r0,0); \
200 do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
201 restore(r0,1); \
202 do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
203
204// round column values
205// on entry: r2,r1,r4,r5
206// on exit: r0,r1,r4,r5
207#define inv_rnd2(arg, table) \
208 save (0,r1); \
209 save (1,r5); \
210 \
211 /* compute new column values */ \
212 do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
213 do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
214 restore(r2,0); \
215 do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
216 restore(r2,1); \
217 do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
218
219// AES (Rijndael) Encryption Subroutine
220/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
221
222.global aes_enc_blk
223
224.extern ft_tab
225.extern fl_tab
226
227.align 4
228
229aes_enc_blk:
230 push %ebp
231 mov tfm(%esp),%ebp
232
233// CAUTION: the order and the values used in these assigns
234// rely on the register mappings
235
2361: push %ebx
237 mov in_blk+4(%esp),%r2
238 push %esi
239 mov nrnd(%ebp),%r3 // number of rounds
240 push %edi
241#if ekey != 0
242 lea ekey(%ebp),%ebp // key pointer
243#endif
244
245// input four columns and xor in first round key
246
247 mov (%r2),%r0
248 mov 4(%r2),%r1
249 mov 8(%r2),%r4
250 mov 12(%r2),%r5
251 xor (%ebp),%r0
252 xor 4(%ebp),%r1
253 xor 8(%ebp),%r4
254 xor 12(%ebp),%r5
255
256 sub $8,%esp // space for register saves on stack
257 add $16,%ebp // increment to next round key
258 cmp $12,%r3
259 jb 4f // 10 rounds for 128-bit key
260 lea 32(%ebp),%ebp
261 je 3f // 12 rounds for 192-bit key
262 lea 32(%ebp),%ebp
263
2642: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key
265 fwd_rnd2( -48(%ebp) ,ft_tab)
2663: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key
267 fwd_rnd2( -16(%ebp) ,ft_tab)
2684: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key
269 fwd_rnd2( +16(%ebp) ,ft_tab)
270 fwd_rnd1( +32(%ebp) ,ft_tab)
271 fwd_rnd2( +48(%ebp) ,ft_tab)
272 fwd_rnd1( +64(%ebp) ,ft_tab)
273 fwd_rnd2( +80(%ebp) ,ft_tab)
274 fwd_rnd1( +96(%ebp) ,ft_tab)
275 fwd_rnd2(+112(%ebp) ,ft_tab)
276 fwd_rnd1(+128(%ebp) ,ft_tab)
277 fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table
278
279// move final values to the output array. CAUTION: the
280// order of these assigns rely on the register mappings
281
282 add $8,%esp
283 mov out_blk+12(%esp),%ebp
284 mov %r5,12(%ebp)
285 pop %edi
286 mov %r4,8(%ebp)
287 pop %esi
288 mov %r1,4(%ebp)
289 pop %ebx
290 mov %r0,(%ebp)
291 pop %ebp
292 mov $1,%eax
293 ret
294
295// AES (Rijndael) Decryption Subroutine
296/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */
297
298.global aes_dec_blk
299
300.extern it_tab
301.extern il_tab
302
303.align 4
304
305aes_dec_blk:
306 push %ebp
307 mov tfm(%esp),%ebp
308
309// CAUTION: the order and the values used in these assigns
310// rely on the register mappings
311
3121: push %ebx
313 mov in_blk+4(%esp),%r2
314 push %esi
315 mov nrnd(%ebp),%r3 // number of rounds
316 push %edi
317#if dkey != 0
318 lea dkey(%ebp),%ebp // key pointer
319#endif
320 mov %r3,%r0
321 shl $4,%r0
322 add %r0,%ebp
323
324// input four columns and xor in first round key
325
326 mov (%r2),%r0
327 mov 4(%r2),%r1
328 mov 8(%r2),%r4
329 mov 12(%r2),%r5
330 xor (%ebp),%r0
331 xor 4(%ebp),%r1
332 xor 8(%ebp),%r4
333 xor 12(%ebp),%r5
334
335 sub $8,%esp // space for register saves on stack
336 sub $16,%ebp // increment to next round key
337 cmp $12,%r3
338 jb 4f // 10 rounds for 128-bit key
339 lea -32(%ebp),%ebp
340 je 3f // 12 rounds for 192-bit key
341 lea -32(%ebp),%ebp
342
3432: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key
344 inv_rnd2( +48(%ebp), it_tab)
3453: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key
346 inv_rnd2( +16(%ebp), it_tab)
3474: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key
348 inv_rnd2( -16(%ebp), it_tab)
349 inv_rnd1( -32(%ebp), it_tab)
350 inv_rnd2( -48(%ebp), it_tab)
351 inv_rnd1( -64(%ebp), it_tab)
352 inv_rnd2( -80(%ebp), it_tab)
353 inv_rnd1( -96(%ebp), it_tab)
354 inv_rnd2(-112(%ebp), it_tab)
355 inv_rnd1(-128(%ebp), it_tab)
356 inv_rnd2(-144(%ebp), il_tab) // last round uses a different table
357
358// move final values to the output array. CAUTION: the
359// order of these assigns rely on the register mappings
360
361 add $8,%esp
362 mov out_blk+12(%esp),%ebp
363 mov %r5,12(%ebp)
364 pop %edi
365 mov %r4,8(%ebp)
366 pop %esi
367 mov %r1,4(%ebp)
368 pop %ebx
369 mov %r0,(%ebp)
370 pop %ebp
371 mov $1,%eax
372 ret
373
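Note on the cmp $12 / jb / je sequences in both routines above: they pick 10, 12 or 14 rounds from the nrnd field and, together with the lea adjustments of %ebp, let the fully unrolled round sequence be entered at label 4:, 3: or 2: respectively. The round count itself follows the usual AES rule, sketched here as an illustrative helper (key_len in bytes):

/* 16-byte key -> 10 rounds, 24 -> 12, 32 -> 14 */
static inline unsigned int aes_rounds(unsigned int key_len)
{
	return key_len / 4 + 6;
}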
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
new file mode 100644
index 000000000000..26b40de4d0b0
--- /dev/null
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -0,0 +1,190 @@
1/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
2 *
3 * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
4 *
5 * License:
6 * This code can be distributed under the terms of the GNU General Public
7 * License (GPL) Version 2 provided that the above header down to and
8 * including this sentence is retained in full.
9 */
10
11.extern aes_ft_tab
12.extern aes_it_tab
13.extern aes_fl_tab
14.extern aes_il_tab
15
16.text
17
18#include <asm/asm-offsets.h>
19
20#define BASE crypto_tfm_ctx_offset
21
22#define R1 %rax
23#define R1E %eax
24#define R1X %ax
25#define R1H %ah
26#define R1L %al
27#define R2 %rbx
28#define R2E %ebx
29#define R2X %bx
30#define R2H %bh
31#define R2L %bl
32#define R3 %rcx
33#define R3E %ecx
34#define R3X %cx
35#define R3H %ch
36#define R3L %cl
37#define R4 %rdx
38#define R4E %edx
39#define R4X %dx
40#define R4H %dh
41#define R4L %dl
42#define R5 %rsi
43#define R5E %esi
44#define R6 %rdi
45#define R6E %edi
46#define R7 %rbp
47#define R7E %ebp
48#define R8 %r8
49#define R9 %r9
50#define R10 %r10
51#define R11 %r11
52
53#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
54 .global FUNC; \
55 .type FUNC,@function; \
56 .align 8; \
57FUNC: movq r1,r2; \
58 movq r3,r4; \
59 leaq BASE+KEY+52(r8),r9; \
60 movq r10,r11; \
61 movl (r7),r5 ## E; \
62 movl 4(r7),r1 ## E; \
63 movl 8(r7),r6 ## E; \
64 movl 12(r7),r7 ## E; \
65 movl BASE(r8),r10 ## E; \
66 xorl -48(r9),r5 ## E; \
67 xorl -44(r9),r1 ## E; \
68 xorl -40(r9),r6 ## E; \
69 xorl -36(r9),r7 ## E; \
70 cmpl $24,r10 ## E; \
71 jb B128; \
72 leaq 32(r9),r9; \
73 je B192; \
74 leaq 32(r9),r9;
75
76#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
77 movq r1,r2; \
78 movq r3,r4; \
79 movl r5 ## E,(r9); \
80 movl r6 ## E,4(r9); \
81 movl r7 ## E,8(r9); \
82 movl r8 ## E,12(r9); \
83 ret;
84
85#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
86 movzbl r2 ## H,r5 ## E; \
87 movzbl r2 ## L,r6 ## E; \
88 movl TAB+1024(,r5,4),r5 ## E;\
89 movw r4 ## X,r2 ## X; \
90 movl TAB(,r6,4),r6 ## E; \
91 roll $16,r2 ## E; \
92 shrl $16,r4 ## E; \
93 movzbl r4 ## H,r7 ## E; \
94 movzbl r4 ## L,r4 ## E; \
95 xorl OFFSET(r8),ra ## E; \
96 xorl OFFSET+4(r8),rb ## E; \
97 xorl TAB+3072(,r7,4),r5 ## E;\
98 xorl TAB+2048(,r4,4),r6 ## E;\
99 movzbl r1 ## L,r7 ## E; \
100 movzbl r1 ## H,r4 ## E; \
101 movl TAB+1024(,r4,4),r4 ## E;\
102 movw r3 ## X,r1 ## X; \
103 roll $16,r1 ## E; \
104 shrl $16,r3 ## E; \
105 xorl TAB(,r7,4),r5 ## E; \
106 movzbl r3 ## H,r7 ## E; \
107 movzbl r3 ## L,r3 ## E; \
108 xorl TAB+3072(,r7,4),r4 ## E;\
109 xorl TAB+2048(,r3,4),r5 ## E;\
110 movzbl r1 ## H,r7 ## E; \
111 movzbl r1 ## L,r3 ## E; \
112 shrl $16,r1 ## E; \
113 xorl TAB+3072(,r7,4),r6 ## E;\
114 movl TAB+2048(,r3,4),r3 ## E;\
115 movzbl r1 ## H,r7 ## E; \
116 movzbl r1 ## L,r1 ## E; \
117 xorl TAB+1024(,r7,4),r6 ## E;\
118 xorl TAB(,r1,4),r3 ## E; \
119 movzbl r2 ## H,r1 ## E; \
120 movzbl r2 ## L,r7 ## E; \
121 shrl $16,r2 ## E; \
122 xorl TAB+3072(,r1,4),r3 ## E;\
123 xorl TAB+2048(,r7,4),r4 ## E;\
124 movzbl r2 ## H,r1 ## E; \
125 movzbl r2 ## L,r2 ## E; \
126 xorl OFFSET+8(r8),rc ## E; \
127 xorl OFFSET+12(r8),rd ## E; \
128 xorl TAB+1024(,r1,4),r3 ## E;\
129 xorl TAB(,r2,4),r4 ## E;
130
131#define move_regs(r1,r2,r3,r4) \
132 movl r3 ## E,r1 ## E; \
133 movl r4 ## E,r2 ## E;
134
135#define entry(FUNC,KEY,B128,B192) \
136 prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
137
138#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
139
140#define encrypt_round(TAB,OFFSET) \
141 round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
142 move_regs(R1,R2,R5,R6)
143
144#define encrypt_final(TAB,OFFSET) \
145 round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
146
147#define decrypt_round(TAB,OFFSET) \
148 round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
149 move_regs(R1,R2,R5,R6)
150
151#define decrypt_final(TAB,OFFSET) \
152 round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
153
154/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
155
156 entry(aes_enc_blk,0,enc128,enc192)
157 encrypt_round(aes_ft_tab,-96)
158 encrypt_round(aes_ft_tab,-80)
159enc192: encrypt_round(aes_ft_tab,-64)
160 encrypt_round(aes_ft_tab,-48)
161enc128: encrypt_round(aes_ft_tab,-32)
162 encrypt_round(aes_ft_tab,-16)
163 encrypt_round(aes_ft_tab, 0)
164 encrypt_round(aes_ft_tab, 16)
165 encrypt_round(aes_ft_tab, 32)
166 encrypt_round(aes_ft_tab, 48)
167 encrypt_round(aes_ft_tab, 64)
168 encrypt_round(aes_ft_tab, 80)
169 encrypt_round(aes_ft_tab, 96)
170 encrypt_final(aes_fl_tab,112)
171 return
172
173/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
174
175 entry(aes_dec_blk,240,dec128,dec192)
176 decrypt_round(aes_it_tab,-96)
177 decrypt_round(aes_it_tab,-80)
178dec192: decrypt_round(aes_it_tab,-64)
179 decrypt_round(aes_it_tab,-48)
180dec128: decrypt_round(aes_it_tab,-32)
181 decrypt_round(aes_it_tab,-16)
182 decrypt_round(aes_it_tab, 0)
183 decrypt_round(aes_it_tab, 16)
184 decrypt_round(aes_it_tab, 32)
185 decrypt_round(aes_it_tab, 48)
186 decrypt_round(aes_it_tab, 64)
187 decrypt_round(aes_it_tab, 80)
188 decrypt_round(aes_it_tab, 96)
189 decrypt_final(aes_il_tab,112)
190 return
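Note on the round() macro above: each output word is built from four lookups in the rotated copies of one table (at byte offsets 0, 1024, 2048 and 3072) xored with a round-key word, which is the standard T-table formulation of an AES round. Roughly, for one forward-round column, with s0..s3 the appropriately rotated input columns and k the round-key word (a sketch of the data flow, not the exact register choreography):

static inline u32 aes_fwd_column(const u32 T[4][256],
				 u32 s0, u32 s1, u32 s2, u32 s3, u32 k)
{
	return T[0][(u8)s0] ^
	       T[1][(u8)(s1 >> 8)] ^
	       T[2][(u8)(s2 >> 16)] ^
	       T[3][(u8)(s3 >> 24)] ^ k;
}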
diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c
new file mode 100644
index 000000000000..49aad9397f10
--- /dev/null
+++ b/arch/x86/crypto/aes_32.c
@@ -0,0 +1,515 @@
1/*
2 *
3 * Glue Code for optimized 586 assembler version of AES
4 *
5 * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
6 * All rights reserved.
7 *
8 * LICENSE TERMS
9 *
10 * The free distribution and use of this software in both source and binary
11 * form is allowed (with or without changes) provided that:
12 *
13 * 1. distributions of this source code include the above copyright
14 * notice, this list of conditions and the following disclaimer;
15 *
16 * 2. distributions in binary form include the above copyright
17 * notice, this list of conditions and the following disclaimer
18 * in the documentation and/or other associated materials;
19 *
20 * 3. the copyright holder's name is not used to endorse products
21 * built using this software without specific written permission.
22 *
23 * ALTERNATIVELY, provided that this notice is retained in full, this product
24 * may be distributed under the terms of the GNU General Public License (GPL),
25 * in which case the provisions of the GPL apply INSTEAD OF those given above.
26 *
27 * DISCLAIMER
28 *
29 * This software is provided 'as is' with no explicit or implied warranties
30 * in respect of its properties, including, but not limited to, correctness
31 * and/or fitness for purpose.
32 *
33 * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
34 * 2.5 API).
35 * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
36 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
37 *
38 */
39
40#include <asm/byteorder.h>
41#include <linux/kernel.h>
42#include <linux/module.h>
43#include <linux/init.h>
44#include <linux/types.h>
45#include <linux/crypto.h>
46#include <linux/linkage.h>
47
48asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51#define AES_MIN_KEY_SIZE 16
52#define AES_MAX_KEY_SIZE 32
53#define AES_BLOCK_SIZE 16
54#define AES_KS_LENGTH (4 * AES_BLOCK_SIZE)
55#define RC_LENGTH 29
56
57struct aes_ctx {
58 u32 ekey[AES_KS_LENGTH];
59 u32 rounds;
60 u32 dkey[AES_KS_LENGTH];
61};
62
63#define WPOLY 0x011b
64#define bytes2word(b0, b1, b2, b3) \
65 (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
66
67/* define the finite field multiplies required for Rijndael */
68#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
69#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
70#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
71#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
72#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
73#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
74#define fi(x) ((x) ? pow[255 - log[x]]: 0)
75
76static inline u32 upr(u32 x, int n)
77{
78 return (x << 8 * n) | (x >> (32 - 8 * n));
79}
80
81static inline u8 bval(u32 x, int n)
82{
83 return x >> 8 * n;
84}
85
86/* The forward and inverse affine transformations used in the S-box */
87#define fwd_affine(x) \
88 (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
89
90#define inv_affine(x) \
91 (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
92
93static u32 rcon_tab[RC_LENGTH];
94
95u32 ft_tab[4][256];
96u32 fl_tab[4][256];
97static u32 im_tab[4][256];
98u32 il_tab[4][256];
99u32 it_tab[4][256];
100
101static void gen_tabs(void)
102{
103 u32 i, w;
104 u8 pow[512], log[256];
105
106 /*
107 * log and power tables for GF(2^8) finite field with
108 * WPOLY as modular polynomial - the simplest primitive
109 * root is 0x03, used here to generate the tables.
110 */
111 i = 0; w = 1;
112
113 do {
114 pow[i] = (u8)w;
115 pow[i + 255] = (u8)w;
116 log[w] = (u8)i++;
117 w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0);
118 } while (w != 1);
119
120 for(i = 0, w = 1; i < RC_LENGTH; ++i) {
121 rcon_tab[i] = bytes2word(w, 0, 0, 0);
122 w = f2(w);
123 }
124
125 for(i = 0; i < 256; ++i) {
126 u8 b;
127
128 b = fwd_affine(fi((u8)i));
129 w = bytes2word(f2(b), b, b, f3(b));
130
131 /* tables for a normal encryption round */
132 ft_tab[0][i] = w;
133 ft_tab[1][i] = upr(w, 1);
134 ft_tab[2][i] = upr(w, 2);
135 ft_tab[3][i] = upr(w, 3);
136 w = bytes2word(b, 0, 0, 0);
137
138 /*
139 * tables for last encryption round
140 * (may also be used in the key schedule)
141 */
142 fl_tab[0][i] = w;
143 fl_tab[1][i] = upr(w, 1);
144 fl_tab[2][i] = upr(w, 2);
145 fl_tab[3][i] = upr(w, 3);
146
147 b = fi(inv_affine((u8)i));
148 w = bytes2word(fe(b), f9(b), fd(b), fb(b));
149
150 /* tables for the inverse mix column operation */
151 im_tab[0][b] = w;
152 im_tab[1][b] = upr(w, 1);
153 im_tab[2][b] = upr(w, 2);
154 im_tab[3][b] = upr(w, 3);
155
156 /* tables for a normal decryption round */
157 it_tab[0][i] = w;
158 it_tab[1][i] = upr(w,1);
159 it_tab[2][i] = upr(w,2);
160 it_tab[3][i] = upr(w,3);
161
162 w = bytes2word(b, 0, 0, 0);
163
164 /* tables for last decryption round */
165 il_tab[0][i] = w;
166 il_tab[1][i] = upr(w,1);
167 il_tab[2][i] = upr(w,2);
168 il_tab[3][i] = upr(w,3);
169 }
170}
171
172#define four_tables(x,tab,vf,rf,c) \
173( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \
174 tab[1][bval(vf(x,1,c),rf(1,c))] ^ \
175 tab[2][bval(vf(x,2,c),rf(2,c))] ^ \
176 tab[3][bval(vf(x,3,c),rf(3,c))] \
177)
178
179#define vf1(x,r,c) (x)
180#define rf1(r,c) (r)
181#define rf2(r,c) ((r-c)&3)
182
183#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
184#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
185
186#define ff(x) inv_mcol(x)
187
188#define ke4(k,i) \
189{ \
190 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
191 k[4*(i)+5] = ss[1] ^= ss[0]; \
192 k[4*(i)+6] = ss[2] ^= ss[1]; \
193 k[4*(i)+7] = ss[3] ^= ss[2]; \
194}
195
196#define kel4(k,i) \
197{ \
198 k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \
199 k[4*(i)+5] = ss[1] ^= ss[0]; \
200 k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \
201}
202
203#define ke6(k,i) \
204{ \
205 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
206 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
207 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
208 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
209 k[6*(i)+10] = ss[4] ^= ss[3]; \
210 k[6*(i)+11] = ss[5] ^= ss[4]; \
211}
212
213#define kel6(k,i) \
214{ \
215 k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
216 k[6*(i)+ 7] = ss[1] ^= ss[0]; \
217 k[6*(i)+ 8] = ss[2] ^= ss[1]; \
218 k[6*(i)+ 9] = ss[3] ^= ss[2]; \
219}
220
221#define ke8(k,i) \
222{ \
223 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
224 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
225 k[8*(i)+10] = ss[2] ^= ss[1]; \
226 k[8*(i)+11] = ss[3] ^= ss[2]; \
227 k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \
228 k[8*(i)+13] = ss[5] ^= ss[4]; \
229 k[8*(i)+14] = ss[6] ^= ss[5]; \
230 k[8*(i)+15] = ss[7] ^= ss[6]; \
231}
232
233#define kel8(k,i) \
234{ \
235 k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
236 k[8*(i)+ 9] = ss[1] ^= ss[0]; \
237 k[8*(i)+10] = ss[2] ^= ss[1]; \
238 k[8*(i)+11] = ss[3] ^= ss[2]; \
239}
240
241#define kdf4(k,i) \
242{ \
243 ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
244 ss[1] = ss[1] ^ ss[3]; \
245 ss[2] = ss[2] ^ ss[3]; \
246 ss[3] = ss[3]; \
247 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
248 ss[i % 4] ^= ss[4]; \
249 ss[4] ^= k[4*(i)]; \
250 k[4*(i)+4] = ff(ss[4]); \
251 ss[4] ^= k[4*(i)+1]; \
252 k[4*(i)+5] = ff(ss[4]); \
253 ss[4] ^= k[4*(i)+2]; \
254 k[4*(i)+6] = ff(ss[4]); \
255 ss[4] ^= k[4*(i)+3]; \
256 k[4*(i)+7] = ff(ss[4]); \
257}
258
259#define kd4(k,i) \
260{ \
261 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
262 ss[i % 4] ^= ss[4]; \
263 ss[4] = ff(ss[4]); \
264 k[4*(i)+4] = ss[4] ^= k[4*(i)]; \
265 k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \
266 k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \
267 k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \
268}
269
270#define kdl4(k,i) \
271{ \
272 ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \
273 ss[i % 4] ^= ss[4]; \
274 k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
275 k[4*(i)+5] = ss[1] ^ ss[3]; \
276 k[4*(i)+6] = ss[0]; \
277 k[4*(i)+7] = ss[1]; \
278}
279
280#define kdf6(k,i) \
281{ \
282 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
283 k[6*(i)+ 6] = ff(ss[0]); \
284 ss[1] ^= ss[0]; \
285 k[6*(i)+ 7] = ff(ss[1]); \
286 ss[2] ^= ss[1]; \
287 k[6*(i)+ 8] = ff(ss[2]); \
288 ss[3] ^= ss[2]; \
289 k[6*(i)+ 9] = ff(ss[3]); \
290 ss[4] ^= ss[3]; \
291 k[6*(i)+10] = ff(ss[4]); \
292 ss[5] ^= ss[4]; \
293 k[6*(i)+11] = ff(ss[5]); \
294}
295
296#define kd6(k,i) \
297{ \
298 ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \
299 ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
300 k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \
301 ss[1] ^= ss[0]; \
302 k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \
303 ss[2] ^= ss[1]; \
304 k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \
305 ss[3] ^= ss[2]; \
306 k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \
307 ss[4] ^= ss[3]; \
308 k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \
309 ss[5] ^= ss[4]; \
310 k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \
311}
312
313#define kdl6(k,i) \
314{ \
315 ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \
316 k[6*(i)+ 6] = ss[0]; \
317 ss[1] ^= ss[0]; \
318 k[6*(i)+ 7] = ss[1]; \
319 ss[2] ^= ss[1]; \
320 k[6*(i)+ 8] = ss[2]; \
321 ss[3] ^= ss[2]; \
322 k[6*(i)+ 9] = ss[3]; \
323}
324
325#define kdf8(k,i) \
326{ \
327 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
328 k[8*(i)+ 8] = ff(ss[0]); \
329 ss[1] ^= ss[0]; \
330 k[8*(i)+ 9] = ff(ss[1]); \
331 ss[2] ^= ss[1]; \
332 k[8*(i)+10] = ff(ss[2]); \
333 ss[3] ^= ss[2]; \
334 k[8*(i)+11] = ff(ss[3]); \
335 ss[4] ^= ls_box(ss[3],0); \
336 k[8*(i)+12] = ff(ss[4]); \
337 ss[5] ^= ss[4]; \
338 k[8*(i)+13] = ff(ss[5]); \
339 ss[6] ^= ss[5]; \
340 k[8*(i)+14] = ff(ss[6]); \
341 ss[7] ^= ss[6]; \
342 k[8*(i)+15] = ff(ss[7]); \
343}
344
345#define kd8(k,i) \
346{ \
347 u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \
348 ss[0] ^= __g; \
349 __g = ff(__g); \
350 k[8*(i)+ 8] = __g ^= k[8*(i)]; \
351 ss[1] ^= ss[0]; \
352 k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \
353 ss[2] ^= ss[1]; \
354 k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \
355 ss[3] ^= ss[2]; \
356 k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \
357 __g = ls_box(ss[3],0); \
358 ss[4] ^= __g; \
359 __g = ff(__g); \
360 k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \
361 ss[5] ^= ss[4]; \
362 k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \
363 ss[6] ^= ss[5]; \
364 k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \
365 ss[7] ^= ss[6]; \
366 k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \
367}
368
369#define kdl8(k,i) \
370{ \
371 ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \
372 k[8*(i)+ 8] = ss[0]; \
373 ss[1] ^= ss[0]; \
374 k[8*(i)+ 9] = ss[1]; \
375 ss[2] ^= ss[1]; \
376 k[8*(i)+10] = ss[2]; \
377 ss[3] ^= ss[2]; \
378 k[8*(i)+11] = ss[3]; \
379}
380
381static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
382 unsigned int key_len)
383{
384 int i;
385 u32 ss[8];
386 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
387 const __le32 *key = (const __le32 *)in_key;
388 u32 *flags = &tfm->crt_flags;
389
390 /* encryption schedule */
391
392 ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
393 ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
394 ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
395 ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
396
397 switch(key_len) {
398 case 16:
399 for (i = 0; i < 9; i++)
400 ke4(ctx->ekey, i);
401 kel4(ctx->ekey, 9);
402 ctx->rounds = 10;
403 break;
404
405 case 24:
406 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
407 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
408 for (i = 0; i < 7; i++)
409 ke6(ctx->ekey, i);
410 kel6(ctx->ekey, 7);
411 ctx->rounds = 12;
412 break;
413
414 case 32:
415 ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
416 ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
417 ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
418 ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
419 for (i = 0; i < 6; i++)
420 ke8(ctx->ekey, i);
421 kel8(ctx->ekey, 6);
422 ctx->rounds = 14;
423 break;
424
425 default:
426 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
427 return -EINVAL;
428 }
429
430 /* decryption schedule */
431
432 ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
433 ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
434 ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
435 ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
436
437 switch (key_len) {
438 case 16:
439 kdf4(ctx->dkey, 0);
440 for (i = 1; i < 9; i++)
441 kd4(ctx->dkey, i);
442 kdl4(ctx->dkey, 9);
443 break;
444
445 case 24:
446 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
447 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
448 kdf6(ctx->dkey, 0);
449 for (i = 1; i < 7; i++)
450 kd6(ctx->dkey, i);
451 kdl6(ctx->dkey, 7);
452 break;
453
454 case 32:
455 ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
456 ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
457 ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
458 ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
459 kdf8(ctx->dkey, 0);
460 for (i = 1; i < 6; i++)
461 kd8(ctx->dkey, i);
462 kdl8(ctx->dkey, 6);
463 break;
464 }
465 return 0;
466}
467
468static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
469{
470 aes_enc_blk(tfm, dst, src);
471}
472
473static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
474{
475 aes_dec_blk(tfm, dst, src);
476}
477
478static struct crypto_alg aes_alg = {
479 .cra_name = "aes",
480 .cra_driver_name = "aes-i586",
481 .cra_priority = 200,
482 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
483 .cra_blocksize = AES_BLOCK_SIZE,
484 .cra_ctxsize = sizeof(struct aes_ctx),
485 .cra_module = THIS_MODULE,
486 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
487 .cra_u = {
488 .cipher = {
489 .cia_min_keysize = AES_MIN_KEY_SIZE,
490 .cia_max_keysize = AES_MAX_KEY_SIZE,
491 .cia_setkey = aes_set_key,
492 .cia_encrypt = aes_encrypt,
493 .cia_decrypt = aes_decrypt
494 }
495 }
496};
497
498static int __init aes_init(void)
499{
500 gen_tabs();
501 return crypto_register_alg(&aes_alg);
502}
503
504static void __exit aes_fini(void)
505{
506 crypto_unregister_alg(&aes_alg);
507}
508
509module_init(aes_init);
510module_exit(aes_fini);
511
512MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
513MODULE_LICENSE("Dual BSD/GPL");
514MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
515MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c
new file mode 100644
index 000000000000..5cdb13ea5cc2
--- /dev/null
+++ b/arch/x86/crypto/aes_64.c
@@ -0,0 +1,336 @@
1/*
2 * Cryptographic API.
3 *
4 * AES Cipher Algorithm.
5 *
6 * Based on Brian Gladman's code.
7 *
8 * Linux developers:
9 * Alexander Kjeldaas <astor@fast.no>
10 * Herbert Valerio Riedel <hvr@hvrlab.org>
11 * Kyle McMartin <kyle@debian.org>
12 * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
13 * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * ---------------------------------------------------------------------------
21 * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
22 * All rights reserved.
23 *
24 * LICENSE TERMS
25 *
26 * The free distribution and use of this software in both source and binary
27 * form is allowed (with or without changes) provided that:
28 *
29 * 1. distributions of this source code include the above copyright
30 * notice, this list of conditions and the following disclaimer;
31 *
32 * 2. distributions in binary form include the above copyright
33 * notice, this list of conditions and the following disclaimer
34 * in the documentation and/or other associated materials;
35 *
36 * 3. the copyright holder's name is not used to endorse products
37 * built using this software without specific written permission.
38 *
39 * ALTERNATIVELY, provided that this notice is retained in full, this product
40 * may be distributed under the terms of the GNU General Public License (GPL),
41 * in which case the provisions of the GPL apply INSTEAD OF those given above.
42 *
43 * DISCLAIMER
44 *
45 * This software is provided 'as is' with no explicit or implied warranties
46 * in respect of its properties, including, but not limited to, correctness
47 * and/or fitness for purpose.
48 * ---------------------------------------------------------------------------
49 */
50
51/* Some changes from the Gladman version:
52 s/RIJNDAEL(e_key)/E_KEY/g
53 s/RIJNDAEL(d_key)/D_KEY/g
54*/
55
56#include <asm/byteorder.h>
57#include <linux/bitops.h>
58#include <linux/crypto.h>
59#include <linux/errno.h>
60#include <linux/init.h>
61#include <linux/module.h>
62#include <linux/types.h>
63
64#define AES_MIN_KEY_SIZE 16
65#define AES_MAX_KEY_SIZE 32
66
67#define AES_BLOCK_SIZE 16
68
69/*
70 * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
71 */
72static inline u8 byte(const u32 x, const unsigned n)
73{
74 return x >> (n << 3);
75}
76
77struct aes_ctx
78{
79 u32 key_length;
80 u32 buf[120];
81};
82
83#define E_KEY (&ctx->buf[0])
84#define D_KEY (&ctx->buf[60])
85
86static u8 pow_tab[256] __initdata;
87static u8 log_tab[256] __initdata;
88static u8 sbx_tab[256] __initdata;
89static u8 isb_tab[256] __initdata;
90static u32 rco_tab[10];
91u32 aes_ft_tab[4][256];
92u32 aes_it_tab[4][256];
93
94u32 aes_fl_tab[4][256];
95u32 aes_il_tab[4][256];
96
97static inline u8 f_mult(u8 a, u8 b)
98{
99 u8 aa = log_tab[a], cc = aa + log_tab[b];
100
101 return pow_tab[cc + (cc < aa ? 1 : 0)];
102}
103
104#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
105
106#define ls_box(x) \
107 (aes_fl_tab[0][byte(x, 0)] ^ \
108 aes_fl_tab[1][byte(x, 1)] ^ \
109 aes_fl_tab[2][byte(x, 2)] ^ \
110 aes_fl_tab[3][byte(x, 3)])
111
112static void __init gen_tabs(void)
113{
114 u32 i, t;
115 u8 p, q;
116
117 /* log and power tables for GF(2**8) finite field with
118 0x011b as modular polynomial - the simplest primitive
119 root is 0x03, used here to generate the tables */
120
121 for (i = 0, p = 1; i < 256; ++i) {
122 pow_tab[i] = (u8)p;
123 log_tab[p] = (u8)i;
124
125 p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
126 }
127
128 log_tab[1] = 0;
129
130 for (i = 0, p = 1; i < 10; ++i) {
131 rco_tab[i] = p;
132
133 p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
134 }
135
136 for (i = 0; i < 256; ++i) {
137 p = (i ? pow_tab[255 - log_tab[i]] : 0);
138 q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
139 p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
140 sbx_tab[i] = p;
141 isb_tab[p] = (u8)i;
142 }
143
144 for (i = 0; i < 256; ++i) {
145 p = sbx_tab[i];
146
147 t = p;
148 aes_fl_tab[0][i] = t;
149 aes_fl_tab[1][i] = rol32(t, 8);
150 aes_fl_tab[2][i] = rol32(t, 16);
151 aes_fl_tab[3][i] = rol32(t, 24);
152
153 t = ((u32)ff_mult(2, p)) |
154 ((u32)p << 8) |
155 ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
156
157 aes_ft_tab[0][i] = t;
158 aes_ft_tab[1][i] = rol32(t, 8);
159 aes_ft_tab[2][i] = rol32(t, 16);
160 aes_ft_tab[3][i] = rol32(t, 24);
161
162 p = isb_tab[i];
163
164 t = p;
165 aes_il_tab[0][i] = t;
166 aes_il_tab[1][i] = rol32(t, 8);
167 aes_il_tab[2][i] = rol32(t, 16);
168 aes_il_tab[3][i] = rol32(t, 24);
169
170 t = ((u32)ff_mult(14, p)) |
171 ((u32)ff_mult(9, p) << 8) |
172 ((u32)ff_mult(13, p) << 16) |
173 ((u32)ff_mult(11, p) << 24);
174
175 aes_it_tab[0][i] = t;
176 aes_it_tab[1][i] = rol32(t, 8);
177 aes_it_tab[2][i] = rol32(t, 16);
178 aes_it_tab[3][i] = rol32(t, 24);
179 }
180}
181
182#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
183
184#define imix_col(y, x) \
185 u = star_x(x); \
186 v = star_x(u); \
187 w = star_x(v); \
188 t = w ^ (x); \
189 (y) = u ^ v ^ w; \
190 (y) ^= ror32(u ^ t, 8) ^ \
191 ror32(v ^ t, 16) ^ \
192 ror32(t, 24)
193
194/* initialise the key schedule from the user supplied key */
195
196#define loop4(i) \
197{ \
198 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
199 t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \
200 t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \
201 t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \
202 t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \
203}
204
205#define loop6(i) \
206{ \
207 t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
208 t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \
209 t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \
210 t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \
211 t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \
212 t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \
213 t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \
214}
215
216#define loop8(i) \
217{ \
218	t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
219 t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \
220 t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \
221 t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \
222 t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \
223 t = E_KEY[8 * i + 4] ^ ls_box(t); \
224 E_KEY[8 * i + 12] = t; \
225 t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \
226 t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \
227 t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \
228}
229
230static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
231 unsigned int key_len)
232{
233 struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
234 const __le32 *key = (const __le32 *)in_key;
235 u32 *flags = &tfm->crt_flags;
236 u32 i, j, t, u, v, w;
237
238 if (key_len % 8) {
239 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
240 return -EINVAL;
241 }
242
243 ctx->key_length = key_len;
244
245 D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
246 D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
247 D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
248 D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
249
250 switch (key_len) {
251 case 16:
252 t = E_KEY[3];
253 for (i = 0; i < 10; ++i)
254 loop4(i);
255 break;
256
257 case 24:
258 E_KEY[4] = le32_to_cpu(key[4]);
259 t = E_KEY[5] = le32_to_cpu(key[5]);
260 for (i = 0; i < 8; ++i)
261			loop6(i);
262 break;
263
264 case 32:
265 E_KEY[4] = le32_to_cpu(key[4]);
266 E_KEY[5] = le32_to_cpu(key[5]);
267 E_KEY[6] = le32_to_cpu(key[6]);
268 t = E_KEY[7] = le32_to_cpu(key[7]);
269 for (i = 0; i < 7; ++i)
270 loop8(i);
271 break;
272 }
273
274 D_KEY[0] = E_KEY[key_len + 24];
275 D_KEY[1] = E_KEY[key_len + 25];
276 D_KEY[2] = E_KEY[key_len + 26];
277 D_KEY[3] = E_KEY[key_len + 27];
278
279 for (i = 4; i < key_len + 24; ++i) {
280 j = key_len + 24 - (i & ~3) + (i & 3);
281 imix_col(D_KEY[j], E_KEY[i]);
282 }
283
284 return 0;
285}
286
287asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
288asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
289
290static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
291{
292 aes_enc_blk(tfm, dst, src);
293}
294
295static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
296{
297 aes_dec_blk(tfm, dst, src);
298}
299
300static struct crypto_alg aes_alg = {
301 .cra_name = "aes",
302 .cra_driver_name = "aes-x86_64",
303 .cra_priority = 200,
304 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
305 .cra_blocksize = AES_BLOCK_SIZE,
306 .cra_ctxsize = sizeof(struct aes_ctx),
307 .cra_module = THIS_MODULE,
308 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
309 .cra_u = {
310 .cipher = {
311 .cia_min_keysize = AES_MIN_KEY_SIZE,
312 .cia_max_keysize = AES_MAX_KEY_SIZE,
313 .cia_setkey = aes_set_key,
314 .cia_encrypt = aes_encrypt,
315 .cia_decrypt = aes_decrypt
316 }
317 }
318};
319
320static int __init aes_init(void)
321{
322 gen_tabs();
323 return crypto_register_alg(&aes_alg);
324}
325
326static void __exit aes_fini(void)
327{
328 crypto_unregister_alg(&aes_alg);
329}
330
331module_init(aes_init);
332module_exit(aes_fini);
333
334MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
335MODULE_LICENSE("GPL");
336MODULE_ALIAS("aes");
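
gen_tabs() above builds the log and power tables for GF(2^8) with the AES polynomial 0x11b and generator 0x03, and f_mult()/ff_mult() multiply through those tables. The standalone userspace sketch below, not part of the patch, shows the same arithmetic and checks it against a plain shift-and-xor multiply; the test vector {57}x{83} = {c1} is the worked example from FIPS-197.

/*
 * Standalone sketch (not part of the patch) of the GF(2^8) arithmetic
 * implemented by gen_tabs()/f_mult() above: log/pow tables over the
 * field defined by x^8 + x^4 + x^3 + x + 1 (0x11b), generator 0x03.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t pow_tab[256], log_tab[256];

static void build_tabs(void)
{
	unsigned int i, p = 1;

	for (i = 0; i < 256; i++) {
		pow_tab[i] = p;
		log_tab[p] = i;
		/* p *= 0x03 (i.e. p*x + p), reduced mod 0x11b */
		p = (p ^ (p << 1) ^ ((p & 0x80) ? 0x11b : 0)) & 0xff;
	}
	log_tab[1] = 0;		/* 0x03^255 == 1, keep log(1) == 0 */
}

static uint8_t gf_mul_tabs(uint8_t a, uint8_t b)
{
	unsigned int s;

	if (!a || !b)
		return 0;
	s = log_tab[a] + log_tab[b];
	if (s >= 255)		/* the multiplicative group has order 255 */
		s -= 255;
	return pow_tab[s];
}

static uint8_t gf_mul_bitwise(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		/* a *= x, reduced mod x^8 + x^4 + x^3 + x + 1 */
		a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
		b >>= 1;
	}
	return r;
}

int main(void)
{
	build_tabs();
	/* FIPS-197 worked example: {57} x {83} = {c1} */
	printf("%02x %02x\n", gf_mul_tabs(0x57, 0x83), gf_mul_bitwise(0x57, 0x83));
	return 0;
}
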
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
new file mode 100644
index 000000000000..39b98ed2c1b9
--- /dev/null
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -0,0 +1,335 @@
1/***************************************************************************
2* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
3* *
4* This program is free software; you can redistribute it and/or modify *
5* it under the terms of the GNU General Public License as published by *
6* the Free Software Foundation; either version 2 of the License, or *
7* (at your option) any later version. *
8* *
9* This program is distributed in the hope that it will be useful, *
10* but WITHOUT ANY WARRANTY; without even the implied warranty of *
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
12* GNU General Public License for more details. *
13* *
14* You should have received a copy of the GNU General Public License *
15* along with this program; if not, write to the *
16* Free Software Foundation, Inc., *
17* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
18***************************************************************************/
19
20.file "twofish-i586-asm.S"
21.text
22
23#include <asm/asm-offsets.h>
24
25/* return address at 0 */
26
27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/
29#define tfm 4 /* Twofish context structure */
30
31#define a_offset 0
32#define b_offset 4
33#define c_offset 8
34#define d_offset 12
35
36/* Structure of the crypto context struct*/
37
38#define s0 0 /* S0 Array 256 Words each */
39#define s1 1024 /* S1 Array */
40#define s2 2048 /* S2 Array */
41#define s3 3072 /* S3 Array */
42#define w 4096 /* 8 whitening keys (word) */
43#define k 4128 /* key 1-32 ( word ) */
44
45/* define a few register aliases to allow macro substitution */
46
47#define R0D %eax
48#define R0B %al
49#define R0H %ah
50
51#define R1D %ebx
52#define R1B %bl
53#define R1H %bh
54
55#define R2D %ecx
56#define R2B %cl
57#define R2H %ch
58
59#define R3D %edx
60#define R3B %dl
61#define R3H %dh
62
63
64/* performs input whitening */
65#define input_whitening(src,context,offset)\
66 xor w+offset(context), src;
67
68/* performs output whitening */
69#define output_whitening(src,context,offset)\
70 xor w+16+offset(context), src;
71
72/*
73 * a input register containing a (rotated 16)
74 * b input register containing b
75 * c input register containing c
76 * d input register containing d (already rol $1)
77 * operations on a and b are interleaved to increase performance
78 */
79#define encrypt_round(a,b,c,d,round)\
80 push d ## D;\
81 movzx b ## B, %edi;\
82 mov s1(%ebp,%edi,4),d ## D;\
83 movzx a ## B, %edi;\
84 mov s2(%ebp,%edi,4),%esi;\
85 movzx b ## H, %edi;\
86 ror $16, b ## D;\
87 xor s2(%ebp,%edi,4),d ## D;\
88 movzx a ## H, %edi;\
89 ror $16, a ## D;\
90 xor s3(%ebp,%edi,4),%esi;\
91 movzx b ## B, %edi;\
92 xor s3(%ebp,%edi,4),d ## D;\
93 movzx a ## B, %edi;\
94 xor (%ebp,%edi,4), %esi;\
95 movzx b ## H, %edi;\
96 ror $15, b ## D;\
97 xor (%ebp,%edi,4), d ## D;\
98 movzx a ## H, %edi;\
99 xor s1(%ebp,%edi,4),%esi;\
100 pop %edi;\
101 add d ## D, %esi;\
102 add %esi, d ## D;\
103 add k+round(%ebp), %esi;\
104 xor %esi, c ## D;\
105 rol $15, c ## D;\
106 add k+4+round(%ebp),d ## D;\
107 xor %edi, d ## D;
108
109/*
110 * a input register containing a (rotated 16)
111 * b input register containing b
112 * c input register containing c
113 * d input register containing d (already rol $1)
114 * operations on a and b are interleaved to increase performance
115 * last round has different rotations for the output preparation
116 */
117#define encrypt_last_round(a,b,c,d,round)\
118 push d ## D;\
119 movzx b ## B, %edi;\
120 mov s1(%ebp,%edi,4),d ## D;\
121 movzx a ## B, %edi;\
122 mov s2(%ebp,%edi,4),%esi;\
123 movzx b ## H, %edi;\
124 ror $16, b ## D;\
125 xor s2(%ebp,%edi,4),d ## D;\
126 movzx a ## H, %edi;\
127 ror $16, a ## D;\
128 xor s3(%ebp,%edi,4),%esi;\
129 movzx b ## B, %edi;\
130 xor s3(%ebp,%edi,4),d ## D;\
131 movzx a ## B, %edi;\
132 xor (%ebp,%edi,4), %esi;\
133 movzx b ## H, %edi;\
134 ror $16, b ## D;\
135 xor (%ebp,%edi,4), d ## D;\
136 movzx a ## H, %edi;\
137 xor s1(%ebp,%edi,4),%esi;\
138 pop %edi;\
139 add d ## D, %esi;\
140 add %esi, d ## D;\
141 add k+round(%ebp), %esi;\
142 xor %esi, c ## D;\
143 ror $1, c ## D;\
144 add k+4+round(%ebp),d ## D;\
145 xor %edi, d ## D;
146
147/*
148 * a input register containing a
149 * b input register containing b (rotated 16)
150 * c input register containing c
151 * d input register containing d (already rol $1)
152 * operations on a and b are interleaved to increase performance
153 */
154#define decrypt_round(a,b,c,d,round)\
155 push c ## D;\
156 movzx a ## B, %edi;\
157 mov (%ebp,%edi,4), c ## D;\
158 movzx b ## B, %edi;\
159 mov s3(%ebp,%edi,4),%esi;\
160 movzx a ## H, %edi;\
161 ror $16, a ## D;\
162 xor s1(%ebp,%edi,4),c ## D;\
163 movzx b ## H, %edi;\
164 ror $16, b ## D;\
165 xor (%ebp,%edi,4), %esi;\
166 movzx a ## B, %edi;\
167 xor s2(%ebp,%edi,4),c ## D;\
168 movzx b ## B, %edi;\
169 xor s1(%ebp,%edi,4),%esi;\
170 movzx a ## H, %edi;\
171 ror $15, a ## D;\
172 xor s3(%ebp,%edi,4),c ## D;\
173 movzx b ## H, %edi;\
174 xor s2(%ebp,%edi,4),%esi;\
175 pop %edi;\
176 add %esi, c ## D;\
177 add c ## D, %esi;\
178 add k+round(%ebp), c ## D;\
179 xor %edi, c ## D;\
180 add k+4+round(%ebp),%esi;\
181 xor %esi, d ## D;\
182 rol $15, d ## D;
183
184/*
185 * a input register containing a
186 * b input register containing b (rotated 16)
187 * c input register containing c
188 * d input register containing d (already rol $1)
189 * operations on a and b are interleaved to increase performance
190 * last round has different rotations for the output preparation
191 */
192#define decrypt_last_round(a,b,c,d,round)\
193 push c ## D;\
194 movzx a ## B, %edi;\
195 mov (%ebp,%edi,4), c ## D;\
196 movzx b ## B, %edi;\
197 mov s3(%ebp,%edi,4),%esi;\
198 movzx a ## H, %edi;\
199 ror $16, a ## D;\
200 xor s1(%ebp,%edi,4),c ## D;\
201 movzx b ## H, %edi;\
202 ror $16, b ## D;\
203 xor (%ebp,%edi,4), %esi;\
204 movzx a ## B, %edi;\
205 xor s2(%ebp,%edi,4),c ## D;\
206 movzx b ## B, %edi;\
207 xor s1(%ebp,%edi,4),%esi;\
208 movzx a ## H, %edi;\
209 ror $16, a ## D;\
210 xor s3(%ebp,%edi,4),c ## D;\
211 movzx b ## H, %edi;\
212 xor s2(%ebp,%edi,4),%esi;\
213 pop %edi;\
214 add %esi, c ## D;\
215 add c ## D, %esi;\
216 add k+round(%ebp), c ## D;\
217 xor %edi, c ## D;\
218 add k+4+round(%ebp),%esi;\
219 xor %esi, d ## D;\
220 ror $1, d ## D;
221
222.align 4
223.global twofish_enc_blk
224.global twofish_dec_blk
225
226twofish_enc_blk:
227 push %ebp /* save registers according to calling convention*/
228 push %ebx
229 push %esi
230 push %edi
231
232	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
233	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
234	mov	in_blk+16(%esp),%edi	/* input address in edi */
235
236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx
238 mov c_offset(%edi), %ecx
239 mov d_offset(%edi), %edx
240 input_whitening(%eax,%ebp,a_offset)
241 ror $16, %eax
242 input_whitening(%ebx,%ebp,b_offset)
243 input_whitening(%ecx,%ebp,c_offset)
244 input_whitening(%edx,%ebp,d_offset)
245 rol $1, %edx
246
247 encrypt_round(R0,R1,R2,R3,0);
248 encrypt_round(R2,R3,R0,R1,8);
249 encrypt_round(R0,R1,R2,R3,2*8);
250 encrypt_round(R2,R3,R0,R1,3*8);
251 encrypt_round(R0,R1,R2,R3,4*8);
252 encrypt_round(R2,R3,R0,R1,5*8);
253 encrypt_round(R0,R1,R2,R3,6*8);
254 encrypt_round(R2,R3,R0,R1,7*8);
255 encrypt_round(R0,R1,R2,R3,8*8);
256 encrypt_round(R2,R3,R0,R1,9*8);
257 encrypt_round(R0,R1,R2,R3,10*8);
258 encrypt_round(R2,R3,R0,R1,11*8);
259 encrypt_round(R0,R1,R2,R3,12*8);
260 encrypt_round(R2,R3,R0,R1,13*8);
261 encrypt_round(R0,R1,R2,R3,14*8);
262 encrypt_last_round(R2,R3,R0,R1,15*8);
263
264 output_whitening(%eax,%ebp,c_offset)
265 output_whitening(%ebx,%ebp,d_offset)
266 output_whitening(%ecx,%ebp,a_offset)
267 output_whitening(%edx,%ebp,b_offset)
268 mov out_blk+16(%esp),%edi;
269 mov %eax, c_offset(%edi)
270 mov %ebx, d_offset(%edi)
271 mov %ecx, (%edi)
272 mov %edx, b_offset(%edi)
273
274 pop %edi
275 pop %esi
276 pop %ebx
277 pop %ebp
278 mov $1, %eax
279 ret
280
281twofish_dec_blk:
282 push %ebp /* save registers according to calling convention*/
283 push %ebx
284 push %esi
285 push %edi
286
287
288	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
289	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
290	mov	in_blk+16(%esp),%edi	/* input address in edi */
291
292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx
294 mov c_offset(%edi), %ecx
295 mov d_offset(%edi), %edx
296 output_whitening(%eax,%ebp,a_offset)
297 output_whitening(%ebx,%ebp,b_offset)
298 ror $16, %ebx
299 output_whitening(%ecx,%ebp,c_offset)
300 output_whitening(%edx,%ebp,d_offset)
301 rol $1, %ecx
302
303 decrypt_round(R0,R1,R2,R3,15*8);
304 decrypt_round(R2,R3,R0,R1,14*8);
305 decrypt_round(R0,R1,R2,R3,13*8);
306 decrypt_round(R2,R3,R0,R1,12*8);
307 decrypt_round(R0,R1,R2,R3,11*8);
308 decrypt_round(R2,R3,R0,R1,10*8);
309 decrypt_round(R0,R1,R2,R3,9*8);
310 decrypt_round(R2,R3,R0,R1,8*8);
311 decrypt_round(R0,R1,R2,R3,7*8);
312 decrypt_round(R2,R3,R0,R1,6*8);
313 decrypt_round(R0,R1,R2,R3,5*8);
314 decrypt_round(R2,R3,R0,R1,4*8);
315 decrypt_round(R0,R1,R2,R3,3*8);
316 decrypt_round(R2,R3,R0,R1,2*8);
317 decrypt_round(R0,R1,R2,R3,1*8);
318 decrypt_last_round(R2,R3,R0,R1,0);
319
320 input_whitening(%eax,%ebp,c_offset)
321 input_whitening(%ebx,%ebp,d_offset)
322 input_whitening(%ecx,%ebp,a_offset)
323 input_whitening(%edx,%ebp,b_offset)
324 mov out_blk+16(%esp),%edi;
325 mov %eax, c_offset(%edi)
326 mov %ebx, d_offset(%edi)
327 mov %ecx, (%edi)
328 mov %edx, b_offset(%edi)
329
330 pop %edi
331 pop %esi
332 pop %ebx
333 pop %ebp
334 mov $1, %eax
335 ret
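
The round macros above are a hand-scheduled form of the standard Twofish round: two g-function lookups through the key-dependent tables at s0..s3, a pseudo-Hadamard transform (the two additions), round-key addition from k[], and one-bit rotations of the other half of the block. The C sketch below, not part of the patch, states that round in its textbook form; the assembler computes the same values but keeps some registers pre-rotated between rounds (hence the ror $16 / rol $15 counts) and interleaves the two lookups. Field names follow struct twofish_ctx from <crypto/twofish.h>, which matches the s0/s1/s2/s3/w/k offsets above.

/*
 * Textbook form (not part of the patch) of what one encrypt_round
 * invocation computes.  ctx->s[][] are the key-dependent S-box tables
 * at offsets s0..s3 above, ctx->k[] is the round-key array at offset k.
 */
#include <crypto/twofish.h>
#include <linux/bitops.h>	/* rol32()/ror32() */

static inline u32 tf_g(const struct twofish_ctx *ctx, u32 x)
{
	return ctx->s[0][x & 0xff] ^
	       ctx->s[1][(x >> 8) & 0xff] ^
	       ctx->s[2][(x >> 16) & 0xff] ^
	       ctx->s[3][x >> 24];
}

/* round r (0..15): a and b feed the F function, c and d are mixed */
static inline void tf_encrypt_round(const struct twofish_ctx *ctx,
				    unsigned int r, u32 a, u32 b,
				    u32 *c, u32 *d)
{
	u32 t0 = tf_g(ctx, a);
	u32 t1 = tf_g(ctx, rol32(b, 8));

	*c = ror32(*c ^ (t0 + t1 + ctx->k[2 * r]), 1);		/* k+round   */
	*d = rol32(*d, 1) ^ (t0 + 2 * t1 + ctx->k[2 * r + 1]);	/* k+4+round */
}
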
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
new file mode 100644
index 000000000000..35974a586615
--- /dev/null
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -0,0 +1,324 @@
1/***************************************************************************
2* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
3* *
4* This program is free software; you can redistribute it and/or modify *
5* it under the terms of the GNU General Public License as published by *
6* the Free Software Foundation; either version 2 of the License, or *
7* (at your option) any later version. *
8* *
9* This program is distributed in the hope that it will be useful, *
10* but WITHOUT ANY WARRANTY; without even the implied warranty of *
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
12* GNU General Public License for more details. *
13* *
14* You should have received a copy of the GNU General Public License *
15* along with this program; if not, write to the *
16* Free Software Foundation, Inc., *
17* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
18***************************************************************************/
19
20.file "twofish-x86_64-asm.S"
21.text
22
23#include <asm/asm-offsets.h>
24
25#define a_offset 0
26#define b_offset 4
27#define c_offset 8
28#define d_offset 12
29
30/* Structure of the crypto context struct*/
31
32#define s0 0 /* S0 Array 256 Words each */
33#define s1 1024 /* S1 Array */
34#define s2 2048 /* S2 Array */
35#define s3 3072 /* S3 Array */
36#define w 4096 /* 8 whitening keys (word) */
37#define k 4128 /* key 1-32 ( word ) */
38
39/* define a few register aliases to allow macro substitution */
40
41#define R0 %rax
42#define R0D %eax
43#define R0B %al
44#define R0H %ah
45
46#define R1 %rbx
47#define R1D %ebx
48#define R1B %bl
49#define R1H %bh
50
51#define R2 %rcx
52#define R2D %ecx
53#define R2B %cl
54#define R2H %ch
55
56#define R3 %rdx
57#define R3D %edx
58#define R3B %dl
59#define R3H %dh
60
61
62/* performs input whitening */
63#define input_whitening(src,context,offset)\
64 xor w+offset(context), src;
65
66/* performs output whitening */
67#define output_whitening(src,context,offset)\
68 xor w+16+offset(context), src;
69
70
71/*
72 * a input register containing a (rotated 16)
73 * b input register containing b
74 * c input register containing c
75 * d input register containing d (already rol $1)
76 * operations on a and b are interleaved to increase performance
77 */
78#define encrypt_round(a,b,c,d,round)\
79 movzx b ## B, %edi;\
80 mov s1(%r11,%rdi,4),%r8d;\
81 movzx a ## B, %edi;\
82 mov s2(%r11,%rdi,4),%r9d;\
83 movzx b ## H, %edi;\
84 ror $16, b ## D;\
85 xor s2(%r11,%rdi,4),%r8d;\
86 movzx a ## H, %edi;\
87 ror $16, a ## D;\
88 xor s3(%r11,%rdi,4),%r9d;\
89 movzx b ## B, %edi;\
90 xor s3(%r11,%rdi,4),%r8d;\
91 movzx a ## B, %edi;\
92 xor (%r11,%rdi,4), %r9d;\
93 movzx b ## H, %edi;\
94 ror $15, b ## D;\
95 xor (%r11,%rdi,4), %r8d;\
96 movzx a ## H, %edi;\
97 xor s1(%r11,%rdi,4),%r9d;\
98 add %r8d, %r9d;\
99 add %r9d, %r8d;\
100 add k+round(%r11), %r9d;\
101 xor %r9d, c ## D;\
102 rol $15, c ## D;\
103 add k+4+round(%r11),%r8d;\
104 xor %r8d, d ## D;
105
106/*
107 * a input register containing a(rotated 16)
108 * b input register containing b
109 * c input register containing c
110 * d input register containing d (already rol $1)
111 * operations on a and b are interleaved to increase performance
112 * during the round a and b are prepared for the output whitening
113 */
114#define encrypt_last_round(a,b,c,d,round)\
115 mov b ## D, %r10d;\
116 shl $32, %r10;\
117 movzx b ## B, %edi;\
118 mov s1(%r11,%rdi,4),%r8d;\
119 movzx a ## B, %edi;\
120 mov s2(%r11,%rdi,4),%r9d;\
121 movzx b ## H, %edi;\
122 ror $16, b ## D;\
123 xor s2(%r11,%rdi,4),%r8d;\
124 movzx a ## H, %edi;\
125 ror $16, a ## D;\
126 xor s3(%r11,%rdi,4),%r9d;\
127 movzx b ## B, %edi;\
128 xor s3(%r11,%rdi,4),%r8d;\
129 movzx a ## B, %edi;\
130 xor (%r11,%rdi,4), %r9d;\
131 xor a, %r10;\
132 movzx b ## H, %edi;\
133 xor (%r11,%rdi,4), %r8d;\
134 movzx a ## H, %edi;\
135 xor s1(%r11,%rdi,4),%r9d;\
136 add %r8d, %r9d;\
137 add %r9d, %r8d;\
138 add k+round(%r11), %r9d;\
139 xor %r9d, c ## D;\
140 ror $1, c ## D;\
141 add k+4+round(%r11),%r8d;\
142 xor %r8d, d ## D
143
144/*
145 * a input register containing a
146 * b input register containing b (rotated 16)
147 * c input register containing c (already rol $1)
148 * d input register containing d
149 * operations on a and b are interleaved to increase performance
150 */
151#define decrypt_round(a,b,c,d,round)\
152 movzx a ## B, %edi;\
153 mov (%r11,%rdi,4), %r9d;\
154 movzx b ## B, %edi;\
155 mov s3(%r11,%rdi,4),%r8d;\
156 movzx a ## H, %edi;\
157 ror $16, a ## D;\
158 xor s1(%r11,%rdi,4),%r9d;\
159 movzx b ## H, %edi;\
160 ror $16, b ## D;\
161 xor (%r11,%rdi,4), %r8d;\
162 movzx a ## B, %edi;\
163 xor s2(%r11,%rdi,4),%r9d;\
164 movzx b ## B, %edi;\
165 xor s1(%r11,%rdi,4),%r8d;\
166 movzx a ## H, %edi;\
167 ror $15, a ## D;\
168 xor s3(%r11,%rdi,4),%r9d;\
169 movzx b ## H, %edi;\
170 xor s2(%r11,%rdi,4),%r8d;\
171 add %r8d, %r9d;\
172 add %r9d, %r8d;\
173 add k+round(%r11), %r9d;\
174 xor %r9d, c ## D;\
175 add k+4+round(%r11),%r8d;\
176 xor %r8d, d ## D;\
177 rol $15, d ## D;
178
179/*
180 * a input register containing a
181 * b input register containing b
182 * c input register containing c (already rol $1)
183 * d input register containing d
184 * operations on a and b are interleaved to increase performance
185 * during the round a and b are prepared for the output whitening
186 */
187#define decrypt_last_round(a,b,c,d,round)\
188 movzx a ## B, %edi;\
189 mov (%r11,%rdi,4), %r9d;\
190 movzx b ## B, %edi;\
191 mov s3(%r11,%rdi,4),%r8d;\
192 movzx b ## H, %edi;\
193 ror $16, b ## D;\
194 xor (%r11,%rdi,4), %r8d;\
195 movzx a ## H, %edi;\
196 mov b ## D, %r10d;\
197 shl $32, %r10;\
198 xor a, %r10;\
199 ror $16, a ## D;\
200 xor s1(%r11,%rdi,4),%r9d;\
201 movzx b ## B, %edi;\
202 xor s1(%r11,%rdi,4),%r8d;\
203 movzx a ## B, %edi;\
204 xor s2(%r11,%rdi,4),%r9d;\
205 movzx b ## H, %edi;\
206 xor s2(%r11,%rdi,4),%r8d;\
207 movzx a ## H, %edi;\
208 xor s3(%r11,%rdi,4),%r9d;\
209 add %r8d, %r9d;\
210 add %r9d, %r8d;\
211 add k+round(%r11), %r9d;\
212 xor %r9d, c ## D;\
213 add k+4+round(%r11),%r8d;\
214 xor %r8d, d ## D;\
215 ror $1, d ## D;
216
217.align 8
218.global twofish_enc_blk
219.global twofish_dec_blk
220
221twofish_enc_blk:
222 pushq R1
223
224	/* %rdi contains the crypto tfm address */
225	/* %rsi contains the output address */
226	/* %rdx contains the input address */
227	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
228	/* ctx address is moved to free one non-rex register
229 as target for the 8bit high operations */
230 mov %rdi, %r11
231
232 movq (R3), R1
233 movq 8(R3), R3
234 input_whitening(R1,%r11,a_offset)
235 input_whitening(R3,%r11,c_offset)
236 mov R1D, R0D
237 rol $16, R0D
238 shr $32, R1
239 mov R3D, R2D
240 shr $32, R3
241 rol $1, R3D
242
243 encrypt_round(R0,R1,R2,R3,0);
244 encrypt_round(R2,R3,R0,R1,8);
245 encrypt_round(R0,R1,R2,R3,2*8);
246 encrypt_round(R2,R3,R0,R1,3*8);
247 encrypt_round(R0,R1,R2,R3,4*8);
248 encrypt_round(R2,R3,R0,R1,5*8);
249 encrypt_round(R0,R1,R2,R3,6*8);
250 encrypt_round(R2,R3,R0,R1,7*8);
251 encrypt_round(R0,R1,R2,R3,8*8);
252 encrypt_round(R2,R3,R0,R1,9*8);
253 encrypt_round(R0,R1,R2,R3,10*8);
254 encrypt_round(R2,R3,R0,R1,11*8);
255 encrypt_round(R0,R1,R2,R3,12*8);
256 encrypt_round(R2,R3,R0,R1,13*8);
257 encrypt_round(R0,R1,R2,R3,14*8);
258 encrypt_last_round(R2,R3,R0,R1,15*8);
259
260
261 output_whitening(%r10,%r11,a_offset)
262 movq %r10, (%rsi)
263
264 shl $32, R1
265 xor R0, R1
266
267 output_whitening(R1,%r11,c_offset)
268 movq R1, 8(%rsi)
269
270 popq R1
271 movq $1,%rax
272 ret
273
274twofish_dec_blk:
275 pushq R1
276
277	/* %rdi contains the crypto tfm address */
278	/* %rsi contains the output address */
279	/* %rdx contains the input address */
280	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
281	/* ctx address is moved to free one non-rex register
282 as target for the 8bit high operations */
283 mov %rdi, %r11
284
285 movq (R3), R1
286 movq 8(R3), R3
287 output_whitening(R1,%r11,a_offset)
288 output_whitening(R3,%r11,c_offset)
289 mov R1D, R0D
290 shr $32, R1
291 rol $16, R1D
292 mov R3D, R2D
293 shr $32, R3
294 rol $1, R2D
295
296 decrypt_round(R0,R1,R2,R3,15*8);
297 decrypt_round(R2,R3,R0,R1,14*8);
298 decrypt_round(R0,R1,R2,R3,13*8);
299 decrypt_round(R2,R3,R0,R1,12*8);
300 decrypt_round(R0,R1,R2,R3,11*8);
301 decrypt_round(R2,R3,R0,R1,10*8);
302 decrypt_round(R0,R1,R2,R3,9*8);
303 decrypt_round(R2,R3,R0,R1,8*8);
304 decrypt_round(R0,R1,R2,R3,7*8);
305 decrypt_round(R2,R3,R0,R1,6*8);
306 decrypt_round(R0,R1,R2,R3,5*8);
307 decrypt_round(R2,R3,R0,R1,4*8);
308 decrypt_round(R0,R1,R2,R3,3*8);
309 decrypt_round(R2,R3,R0,R1,2*8);
310 decrypt_round(R0,R1,R2,R3,1*8);
311 decrypt_last_round(R2,R3,R0,R1,0);
312
313 input_whitening(%r10,%r11,a_offset)
314 movq %r10, (%rsi)
315
316 shl $32, R1
317 xor R0, R1
318
319 input_whitening(R1,%r11,c_offset)
320 movq R1, 8(%rsi)
321
322 popq R1
323 movq $1,%rax
324 ret
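
Unlike the i586 version, the 64-bit entry points load the block as two quadwords, whiten them, and split each into two 32-bit round words; encrypt_last_round/decrypt_last_round then glue pairs of words back together through %r10 (shl $32 / xor) before the output whitening. A small standalone sketch of that split and merge, not part of the patch:

/*
 * Standalone sketch (not part of the patch) of the word packing the
 * 64-bit entry code above performs around the rounds.
 */
#include <stdint.h>

/* movq (in), reg; then split into two 32-bit round words (mov RxD / shr $32) */
static void split_half(uint64_t half, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)half;
	*hi = (uint32_t)(half >> 32);
}

/* "mov b##D, %r10d; shl $32, %r10; xor a, %r10" in the last round */
static uint64_t merge_half(uint32_t lo, uint32_t hi)
{
	return (uint64_t)lo | ((uint64_t)hi << 32);
}
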
diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_32.c
new file mode 100644
index 000000000000..e3004dfe9c7a
--- /dev/null
+++ b/arch/x86/crypto/twofish_32.c
@@ -0,0 +1,97 @@
1/*
2 * Glue Code for optimized 586 assembler version of TWOFISH
3 *
4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
6 * 256-bit key length added March 20, 1999
7 * Some modifications to reduce the text size by Werner Koch, April, 1998
8 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
9 * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
10 *
11 * The original author has disclaimed all copyright interest in this
12 * code and thus put it in the public domain. The subsequent authors
13 * have put this under the GNU General Public License.
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 * This code is a "clean room" implementation, written from the paper
31 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
32 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
33 * through http://www.counterpane.com/twofish.html
34 *
35 * For background information on multiplication in finite fields, used for
36 * the matrix operations in the key schedule, see the book _Contemporary
37 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
38 * Third Edition.
39 */
40
41#include <crypto/twofish.h>
42#include <linux/crypto.h>
43#include <linux/init.h>
44#include <linux/module.h>
45#include <linux/types.h>
46
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
52{
53 twofish_enc_blk(tfm, dst, src);
54}
55
56static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
57{
58 twofish_dec_blk(tfm, dst, src);
59}
60
61static struct crypto_alg alg = {
62 .cra_name = "twofish",
63 .cra_driver_name = "twofish-i586",
64 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE,
67 .cra_ctxsize = sizeof(struct twofish_ctx),
68 .cra_alignmask = 3,
69 .cra_module = THIS_MODULE,
70 .cra_list = LIST_HEAD_INIT(alg.cra_list),
71 .cra_u = {
72 .cipher = {
73 .cia_min_keysize = TF_MIN_KEY_SIZE,
74 .cia_max_keysize = TF_MAX_KEY_SIZE,
75 .cia_setkey = twofish_setkey,
76 .cia_encrypt = twofish_encrypt,
77 .cia_decrypt = twofish_decrypt
78 }
79 }
80};
81
82static int __init init(void)
83{
84 return crypto_register_alg(&alg);
85}
86
87static void __exit fini(void)
88{
89 crypto_unregister_alg(&alg);
90}
91
92module_init(init);
93module_exit(fini);
94
95MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
97MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c
new file mode 100644
index 000000000000..182d91d5cfb9
--- /dev/null
+++ b/arch/x86/crypto/twofish_64.c
@@ -0,0 +1,97 @@
1/*
2 * Glue Code for optimized x86_64 assembler version of TWOFISH
3 *
4 * Originally Twofish for GPG
5 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
6 * 256-bit key length added March 20, 1999
7 * Some modifications to reduce the text size by Werner Koch, April, 1998
8 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
9 * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
10 *
11 * The original author has disclaimed all copyright interest in this
12 * code and thus put it in the public domain. The subsequent authors
13 * have put this under the GNU General Public License.
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, write to the Free Software
27 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
28 * USA
29 *
30 * This code is a "clean room" implementation, written from the paper
31 * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
32 * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
33 * through http://www.counterpane.com/twofish.html
34 *
35 * For background information on multiplication in finite fields, used for
36 * the matrix operations in the key schedule, see the book _Contemporary
37 * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
38 * Third Edition.
39 */
40
41#include <crypto/twofish.h>
42#include <linux/crypto.h>
43#include <linux/init.h>
44#include <linux/kernel.h>
45#include <linux/module.h>
46#include <linux/types.h>
47
48asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
49asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
50
51static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
52{
53 twofish_enc_blk(tfm, dst, src);
54}
55
56static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
57{
58 twofish_dec_blk(tfm, dst, src);
59}
60
61static struct crypto_alg alg = {
62 .cra_name = "twofish",
63 .cra_driver_name = "twofish-x86_64",
64 .cra_priority = 200,
65 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
66 .cra_blocksize = TF_BLOCK_SIZE,
67 .cra_ctxsize = sizeof(struct twofish_ctx),
68 .cra_alignmask = 3,
69 .cra_module = THIS_MODULE,
70 .cra_list = LIST_HEAD_INIT(alg.cra_list),
71 .cra_u = {
72 .cipher = {
73 .cia_min_keysize = TF_MIN_KEY_SIZE,
74 .cia_max_keysize = TF_MAX_KEY_SIZE,
75 .cia_setkey = twofish_setkey,
76 .cia_encrypt = twofish_encrypt,
77 .cia_decrypt = twofish_decrypt
78 }
79 }
80};
81
82static int __init init(void)
83{
84 return crypto_register_alg(&alg);
85}
86
87static void __exit fini(void)
88{
89 crypto_unregister_alg(&alg);
90}
91
92module_init(init);
93module_exit(fini);
94
95MODULE_LICENSE("GPL");
96MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
97MODULE_ALIAS("twofish");
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
new file mode 100644
index 000000000000..cdae36435e21
--- /dev/null
+++ b/arch/x86/ia32/Makefile
@@ -0,0 +1,35 @@
1#
2# Makefile for the ia32 kernel emulation subsystem.
3#
4
5obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
6 ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
7 mmap32.o
8
9sysv-$(CONFIG_SYSVIPC) := ipc32.o
10obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
11
12obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
13
14audit-class-$(CONFIG_AUDIT) := audit.o
15obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
16
17$(obj)/syscall32_syscall.o: \
18 $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
19
20# Teach kbuild about targets
21targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
22
23# The DSO images are built using a special linker script
24quiet_cmd_syscall = SYSCALL $@
25 cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
27 -Wl,-soname=linux-gate.so.1 -o $@ \
28 -Wl,-T,$(filter-out FORCE,$^)
29
30$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
31$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
32 $(call if_changed,syscall)
33
34AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
35AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
new file mode 100644
index 000000000000..91b7b5922dfa
--- /dev/null
+++ b/arch/x86/ia32/audit.c
@@ -0,0 +1,42 @@
1#include <asm/unistd_32.h>
2
3unsigned ia32_dir_class[] = {
4#include <asm-generic/audit_dir_write.h>
5~0U
6};
7
8unsigned ia32_chattr_class[] = {
9#include <asm-generic/audit_change_attr.h>
10~0U
11};
12
13unsigned ia32_write_class[] = {
14#include <asm-generic/audit_write.h>
15~0U
16};
17
18unsigned ia32_read_class[] = {
19#include <asm-generic/audit_read.h>
20~0U
21};
22
23unsigned ia32_signal_class[] = {
24#include <asm-generic/audit_signal.h>
25~0U
26};
27
28int ia32_classify_syscall(unsigned syscall)
29{
30 switch(syscall) {
31 case __NR_open:
32 return 2;
33 case __NR_openat:
34 return 3;
35 case __NR_socketcall:
36 return 4;
37 case __NR_execve:
38 return 5;
39 default:
40 return 1;
41 }
42}
diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c
new file mode 100644
index 000000000000..2c8209a3605a
--- /dev/null
+++ b/arch/x86/ia32/fpu32.c
@@ -0,0 +1,183 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
4 * This is used for ptrace, signals and coredumps in 32bit emulation.
5 */
6
7#include <linux/sched.h>
8#include <asm/sigcontext32.h>
9#include <asm/processor.h>
10#include <asm/uaccess.h>
11#include <asm/i387.h>
12
13static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
14{
15 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
16
17 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
18 tmp = ~twd;
19 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
20 /* and move the valid bits to the lower byte. */
21 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
22 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
23 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
24 return tmp;
25}
26
27static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
28{
29 struct _fpxreg *st = NULL;
30 unsigned long tos = (fxsave->swd >> 11) & 7;
31 unsigned long twd = (unsigned long) fxsave->twd;
32 unsigned long tag;
33 unsigned long ret = 0xffff0000;
34 int i;
35
36#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
37
38 for (i = 0 ; i < 8 ; i++) {
39 if (twd & 0x1) {
40 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
41
42 switch (st->exponent & 0x7fff) {
43 case 0x7fff:
44 tag = 2; /* Special */
45 break;
46 case 0x0000:
47 if ( !st->significand[0] &&
48 !st->significand[1] &&
49 !st->significand[2] &&
50 !st->significand[3] ) {
51 tag = 1; /* Zero */
52 } else {
53 tag = 2; /* Special */
54 }
55 break;
56 default:
57 if (st->significand[3] & 0x8000) {
58 tag = 0; /* Valid */
59 } else {
60 tag = 2; /* Special */
61 }
62 break;
63 }
64 } else {
65 tag = 3; /* Empty */
66 }
67 ret |= (tag << (2 * i));
68 twd = twd >> 1;
69 }
70 return ret;
71}
72
73
74static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
75 struct _fpstate_ia32 __user *buf)
76{
77 struct _fpxreg *to;
78 struct _fpreg __user *from;
79 int i;
80 u32 v;
81 int err = 0;
82
83#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
84 G(0, fxsave->cwd);
85 G(1, fxsave->swd);
86 G(2, fxsave->twd);
87 fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
88 G(3, fxsave->rip);
89 G(4, v);
90 fxsave->fop = v>>16; /* cs ignored */
91 G(5, fxsave->rdp);
92 /* 6: ds ignored */
93#undef G
94 if (err)
95 return -1;
96
97 to = (struct _fpxreg *)&fxsave->st_space[0];
98 from = &buf->_st[0];
99 for (i = 0 ; i < 8 ; i++, to++, from++) {
100 if (__copy_from_user(to, from, sizeof(*from)))
101 return -1;
102 }
103 return 0;
104}
105
106
107static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
108 struct i387_fxsave_struct *fxsave,
109 struct pt_regs *regs,
110 struct task_struct *tsk)
111{
112 struct _fpreg __user *to;
113 struct _fpxreg *from;
114 int i;
115 u16 cs,ds;
116 int err = 0;
117
118 if (tsk == current) {
119		/* should actually be ds/cs at fpu exception time,
120 but that information is not available in 64bit mode. */
121 asm("movw %%ds,%0 " : "=r" (ds));
122 asm("movw %%cs,%0 " : "=r" (cs));
123 } else { /* ptrace. task has stopped. */
124 ds = tsk->thread.ds;
125 cs = regs->cs;
126 }
127
128#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
129 P(0, (u32)fxsave->cwd | 0xffff0000);
130 P(1, (u32)fxsave->swd | 0xffff0000);
131 P(2, twd_fxsr_to_i387(fxsave));
132 P(3, (u32)fxsave->rip);
133 P(4, cs | ((u32)fxsave->fop) << 16);
134 P(5, fxsave->rdp);
135 P(6, 0xffff0000 | ds);
136#undef P
137
138 if (err)
139 return -1;
140
141 to = &buf->_st[0];
142 from = (struct _fpxreg *) &fxsave->st_space[0];
143 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
144 if (__copy_to_user(to, from, sizeof(*to)))
145 return -1;
146 }
147 return 0;
148}
149
150int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
151{
152 clear_fpu(tsk);
153 if (!fsave) {
154 if (__copy_from_user(&tsk->thread.i387.fxsave,
155 &buf->_fxsr_env[0],
156 sizeof(struct i387_fxsave_struct)))
157 return -1;
158 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
159 set_stopped_child_used_math(tsk);
160 }
161 return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
162}
163
164int save_i387_ia32(struct task_struct *tsk,
165 struct _fpstate_ia32 __user *buf,
166 struct pt_regs *regs,
167 int fsave)
168{
169 int err = 0;
170
171 init_fpu(tsk);
172 if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
173 return -1;
174 if (fsave)
175 return 0;
176 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
177 if (fsave)
178 return err ? -1 : 1;
179 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
180 err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
181 sizeof(struct i387_fxsave_struct));
182 return err ? -1 : 1;
183}
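
twd_i387_to_fxsr() above compresses the legacy i387 tag word, two bits per register (00 valid, 01 zero, 10 special, 11 empty), into the FXSR form of one bit per register (1 = not empty): inverting the word turns empty pairs into 00 and everything else into a non-zero pair, and the or/shift/mask cascade then folds each pair down to a single bit. A standalone worked example, not part of the patch:

/*
 * Standalone sketch (not part of the patch) of the tag-word folding in
 * twd_i387_to_fxsr() above, with one worked input: registers 0 (valid,
 * tag 00) and 4 (zero, tag 01) in use, all others empty (tag 11).
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t i387_to_fxsr_twd(uint16_t twd)
{
	unsigned int tmp = (uint16_t)~twd;	/* empty pairs become 00 */

	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* each pair -> 0V        */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* pack the pairs         */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* pack the nibbles       */
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* one bit per register   */
	return tmp;
}

int main(void)
{
	/* pairs (reg7..reg0): 11 11 11 01 11 11 11 00 -> 0xfdfc */
	printf("%#x\n", i387_to_fxsr_twd(0xfdfc));	/* prints 0x11: bits 0 and 4 set */
	return 0;
}
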
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
new file mode 100644
index 000000000000..08781370256d
--- /dev/null
+++ b/arch/x86/ia32/ia32_aout.c
@@ -0,0 +1,528 @@
1/*
2 * a.out loader for x86-64
3 *
4 * Copyright (C) 1991, 1992, 1996 Linus Torvalds
5 * Hacked together by Andi Kleen
6 */
7
8#include <linux/module.h>
9
10#include <linux/time.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/mman.h>
14#include <linux/a.out.h>
15#include <linux/errno.h>
16#include <linux/signal.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/file.h>
20#include <linux/stat.h>
21#include <linux/fcntl.h>
22#include <linux/ptrace.h>
23#include <linux/user.h>
24#include <linux/slab.h>
25#include <linux/binfmts.h>
26#include <linux/personality.h>
27#include <linux/init.h>
28
29#include <asm/system.h>
30#include <asm/uaccess.h>
31#include <asm/pgalloc.h>
32#include <asm/cacheflush.h>
33#include <asm/user32.h>
34#include <asm/ia32.h>
35
36#undef WARN_OLD
37#undef CORE_DUMP /* probably broken */
38
39static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
40static int load_aout_library(struct file*);
41
42#ifdef CORE_DUMP
43static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
44
45/*
46 * fill in the user structure for a core dump..
47 */
48static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
49{
50 u32 fs,gs;
51
52/* changed the size calculations - should hopefully work better. lbt */
53 dump->magic = CMAGIC;
54 dump->start_code = 0;
55 dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
56 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
57 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
58 dump->u_dsize -= dump->u_tsize;
59 dump->u_ssize = 0;
60 dump->u_debugreg[0] = current->thread.debugreg0;
61 dump->u_debugreg[1] = current->thread.debugreg1;
62 dump->u_debugreg[2] = current->thread.debugreg2;
63 dump->u_debugreg[3] = current->thread.debugreg3;
64 dump->u_debugreg[4] = 0;
65 dump->u_debugreg[5] = 0;
66 dump->u_debugreg[6] = current->thread.debugreg6;
67 dump->u_debugreg[7] = current->thread.debugreg7;
68
69 if (dump->start_stack < 0xc0000000)
70 dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
71
72 dump->regs.ebx = regs->rbx;
73 dump->regs.ecx = regs->rcx;
74 dump->regs.edx = regs->rdx;
75 dump->regs.esi = regs->rsi;
76 dump->regs.edi = regs->rdi;
77 dump->regs.ebp = regs->rbp;
78 dump->regs.eax = regs->rax;
79 dump->regs.ds = current->thread.ds;
80 dump->regs.es = current->thread.es;
81 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
82 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
83 dump->regs.orig_eax = regs->orig_rax;
84 dump->regs.eip = regs->rip;
85 dump->regs.cs = regs->cs;
86 dump->regs.eflags = regs->eflags;
87 dump->regs.esp = regs->rsp;
88 dump->regs.ss = regs->ss;
89
90#if 1 /* FIXME */
91 dump->u_fpvalid = 0;
92#else
93 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
94#endif
95}
96
97#endif
98
99static struct linux_binfmt aout_format = {
100 .module = THIS_MODULE,
101 .load_binary = load_aout_binary,
102 .load_shlib = load_aout_library,
103#ifdef CORE_DUMP
104 .core_dump = aout_core_dump,
105#endif
106 .min_coredump = PAGE_SIZE
107};
108
109static void set_brk(unsigned long start, unsigned long end)
110{
111 start = PAGE_ALIGN(start);
112 end = PAGE_ALIGN(end);
113 if (end <= start)
114 return;
115 down_write(&current->mm->mmap_sem);
116 do_brk(start, end - start);
117 up_write(&current->mm->mmap_sem);
118}
119
120#ifdef CORE_DUMP
121/*
122 * These are the only things you should do on a core-file: use only these
123 * macros to write out all the necessary info.
124 */
125
126static int dump_write(struct file *file, const void *addr, int nr)
127{
128 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
129}
130
131#define DUMP_WRITE(addr, nr) \
132 if (!dump_write(file, (void *)(addr), (nr))) \
133 goto end_coredump;
134
135#define DUMP_SEEK(offset) \
136if (file->f_op->llseek) { \
137 if (file->f_op->llseek(file,(offset),0) != (offset)) \
138 goto end_coredump; \
139} else file->f_pos = (offset)
140
141/*
142 * Routine writes a core dump image in the current directory.
143 * Currently only a stub-function.
144 *
145 * Note that setuid/setgid files won't make a core-dump if the uid/gid
146 * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
147 * field, which also makes sure the core-dumps won't be recursive if the
148 * dumping of the process results in another error..
149 */
150
151static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
152{
153 mm_segment_t fs;
154 int has_dumped = 0;
155 unsigned long dump_start, dump_size;
156 struct user32 dump;
157# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
158# define START_STACK(u) (u.start_stack)
159
160 fs = get_fs();
161 set_fs(KERNEL_DS);
162 has_dumped = 1;
163 current->flags |= PF_DUMPCORE;
164 strncpy(dump.u_comm, current->comm, sizeof(current->comm));
165 dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
166 dump.signal = signr;
167 dump_thread32(regs, &dump);
168
169/* If the size of the dump file exceeds the rlimit, then see what would happen
170 if we wrote the stack, but not the data area. */
171 if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
172 current->signal->rlim[RLIMIT_CORE].rlim_cur)
173 dump.u_dsize = 0;
174
175/* Make sure we have enough room to write the stack and data areas. */
176 if ((dump.u_ssize+1) * PAGE_SIZE >
177 current->signal->rlim[RLIMIT_CORE].rlim_cur)
178 dump.u_ssize = 0;
179
180/* make sure we actually have a data and stack area to dump */
181 set_fs(USER_DS);
182 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
183 dump.u_dsize = 0;
184 if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
185 dump.u_ssize = 0;
186
187 set_fs(KERNEL_DS);
188/* struct user */
189 DUMP_WRITE(&dump,sizeof(dump));
190/* Now dump all of the user data. Include malloced stuff as well */
191 DUMP_SEEK(PAGE_SIZE);
192/* now we start writing out the user space info */
193 set_fs(USER_DS);
194/* Dump the data area */
195 if (dump.u_dsize != 0) {
196 dump_start = START_DATA(dump);
197 dump_size = dump.u_dsize << PAGE_SHIFT;
198 DUMP_WRITE(dump_start,dump_size);
199 }
200/* Now prepare to dump the stack area */
201 if (dump.u_ssize != 0) {
202 dump_start = START_STACK(dump);
203 dump_size = dump.u_ssize << PAGE_SHIFT;
204 DUMP_WRITE(dump_start,dump_size);
205 }
206/* Finally dump the task struct. Not used by gdb, but could be useful */
207 set_fs(KERNEL_DS);
208 DUMP_WRITE(current,sizeof(*current));
209end_coredump:
210 set_fs(fs);
211 return has_dumped;
212}
213#endif
214
215/*
216 * create_aout_tables() parses the env- and arg-strings in new user
217 * memory and creates the pointer tables from them, and puts their
218 * addresses on the "stack", returning the new stack pointer value.
219 */
220static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
221{
222 u32 __user *argv;
223 u32 __user *envp;
224 u32 __user *sp;
225 int argc = bprm->argc;
226 int envc = bprm->envc;
227
228 sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
229 sp -= envc+1;
230 envp = sp;
231 sp -= argc+1;
232 argv = sp;
233 put_user((unsigned long) envp,--sp);
234 put_user((unsigned long) argv,--sp);
235 put_user(argc,--sp);
236 current->mm->arg_start = (unsigned long) p;
237 while (argc-->0) {
238 char c;
239 put_user((u32)(unsigned long)p,argv++);
240 do {
241 get_user(c,p++);
242 } while (c);
243 }
244 put_user(0, argv);
245 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
246 while (envc-->0) {
247 char c;
248 put_user((u32)(unsigned long)p,envp++);
249 do {
250 get_user(c,p++);
251 } while (c);
252 }
253 put_user(0, envp);
254 current->mm->env_end = (unsigned long) p;
255 return sp;
256}
257
258/*
259 * These are the functions used to load a.out style executables and shared
260 * libraries. There is no binary dependent code anywhere else.
261 */
262
263static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
264{
265 struct exec ex;
266 unsigned long error;
267 unsigned long fd_offset;
268 unsigned long rlim;
269 int retval;
270
271 ex = *((struct exec *) bprm->buf); /* exec-header */
272 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
273 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
274 N_TRSIZE(ex) || N_DRSIZE(ex) ||
275 i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
276 return -ENOEXEC;
277 }
278
279 fd_offset = N_TXTOFF(ex);
280
281 /* Check initial limits. This avoids letting people circumvent
282 * size limits imposed on them by creating programs with large
283 * arrays in the data or bss.
284 */
285 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
286 if (rlim >= RLIM_INFINITY)
287 rlim = ~0;
288 if (ex.a_data + ex.a_bss > rlim)
289 return -ENOMEM;
290
291 /* Flush all traces of the currently running executable */
292 retval = flush_old_exec(bprm);
293 if (retval)
294 return retval;
295
296 regs->cs = __USER32_CS;
297 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
298 regs->r13 = regs->r14 = regs->r15 = 0;
299
300 /* OK, This is the point of no return */
301 set_personality(PER_LINUX);
302 set_thread_flag(TIF_IA32);
303 clear_thread_flag(TIF_ABI_PENDING);
304
305 current->mm->end_code = ex.a_text +
306 (current->mm->start_code = N_TXTADDR(ex));
307 current->mm->end_data = ex.a_data +
308 (current->mm->start_data = N_DATADDR(ex));
309 current->mm->brk = ex.a_bss +
310 (current->mm->start_brk = N_BSSADDR(ex));
311 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
312 current->mm->cached_hole_size = 0;
313
314 current->mm->mmap = NULL;
315 compute_creds(bprm);
316 current->flags &= ~PF_FORKNOEXEC;
317
318 if (N_MAGIC(ex) == OMAGIC) {
319 unsigned long text_addr, map_size;
320 loff_t pos;
321
322 text_addr = N_TXTADDR(ex);
323
324 pos = 32;
325 map_size = ex.a_text+ex.a_data;
326
327 down_write(&current->mm->mmap_sem);
328 error = do_brk(text_addr & PAGE_MASK, map_size);
329 up_write(&current->mm->mmap_sem);
330
331 if (error != (text_addr & PAGE_MASK)) {
332 send_sig(SIGKILL, current, 0);
333 return error;
334 }
335
336 error = bprm->file->f_op->read(bprm->file,
337 (char __user *)text_addr,
338 ex.a_text+ex.a_data, &pos);
339 if ((signed long)error < 0) {
340 send_sig(SIGKILL, current, 0);
341 return error;
342 }
343
344 flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
345 } else {
346#ifdef WARN_OLD
347 static unsigned long error_time, error_time2;
348 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
349 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
350 {
351 printk(KERN_NOTICE "executable not page aligned\n");
352 error_time2 = jiffies;
353 }
354
355 if ((fd_offset & ~PAGE_MASK) != 0 &&
356 (jiffies-error_time) > 5*HZ)
357 {
358 printk(KERN_WARNING
359 "fd_offset is not page aligned. Please convert program: %s\n",
360 bprm->file->f_path.dentry->d_name.name);
361 error_time = jiffies;
362 }
363#endif
364
365 if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
366 loff_t pos = fd_offset;
367 down_write(&current->mm->mmap_sem);
368 do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
369 up_write(&current->mm->mmap_sem);
370 bprm->file->f_op->read(bprm->file,
371 (char __user *)N_TXTADDR(ex),
372 ex.a_text+ex.a_data, &pos);
373 flush_icache_range((unsigned long) N_TXTADDR(ex),
374 (unsigned long) N_TXTADDR(ex) +
375 ex.a_text+ex.a_data);
376 goto beyond_if;
377 }
378
379 down_write(&current->mm->mmap_sem);
380 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
381 PROT_READ | PROT_EXEC,
382 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
383 fd_offset);
384 up_write(&current->mm->mmap_sem);
385
386 if (error != N_TXTADDR(ex)) {
387 send_sig(SIGKILL, current, 0);
388 return error;
389 }
390
391 down_write(&current->mm->mmap_sem);
392 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
393 PROT_READ | PROT_WRITE | PROT_EXEC,
394 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
395 fd_offset + ex.a_text);
396 up_write(&current->mm->mmap_sem);
397 if (error != N_DATADDR(ex)) {
398 send_sig(SIGKILL, current, 0);
399 return error;
400 }
401 }
402beyond_if:
403 set_binfmt(&aout_format);
404
405 set_brk(current->mm->start_brk, current->mm->brk);
406
407 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
408 if (retval < 0) {
409 /* Someone check-me: is this error path enough? */
410 send_sig(SIGKILL, current, 0);
411 return retval;
412 }
413
414 current->mm->start_stack =
415 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
416 /* start thread */
417	asm volatile("movl %0,%%fs" :: "r" (0));
418 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
419 load_gs_index(0);
420 (regs)->rip = ex.a_entry;
421 (regs)->rsp = current->mm->start_stack;
422 (regs)->eflags = 0x200;
423 (regs)->cs = __USER32_CS;
424 (regs)->ss = __USER32_DS;
425 set_fs(USER_DS);
426 if (unlikely(current->ptrace & PT_PTRACED)) {
427 if (current->ptrace & PT_TRACE_EXEC)
428 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
429 else
430 send_sig(SIGTRAP, current, 0);
431 }
432 return 0;
433}
434
435static int load_aout_library(struct file *file)
436{
437 struct inode * inode;
438 unsigned long bss, start_addr, len;
439 unsigned long error;
440 int retval;
441 struct exec ex;
442
443 inode = file->f_path.dentry->d_inode;
444
445 retval = -ENOEXEC;
446 error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
447 if (error != sizeof(ex))
448 goto out;
449
450 /* We come in here for the regular a.out style of shared libraries */
451 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
452 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
453 i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
454 goto out;
455 }
456
457 if (N_FLAGS(ex))
458 goto out;
459
460 /* For QMAGIC, the starting address is 0x20 into the page. We mask
461 this off to get the starting address for the page */
462
463 start_addr = ex.a_entry & 0xfffff000;
464
465 if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
466 loff_t pos = N_TXTOFF(ex);
467
468#ifdef WARN_OLD
469 static unsigned long error_time;
470 if ((jiffies-error_time) > 5*HZ)
471 {
472 printk(KERN_WARNING
473 "N_TXTOFF is not page aligned. Please convert library: %s\n",
474 file->f_path.dentry->d_name.name);
475 error_time = jiffies;
476 }
477#endif
478 down_write(&current->mm->mmap_sem);
479 do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
480 up_write(&current->mm->mmap_sem);
481
482 file->f_op->read(file, (char __user *)start_addr,
483 ex.a_text + ex.a_data, &pos);
484 flush_icache_range((unsigned long) start_addr,
485 (unsigned long) start_addr + ex.a_text + ex.a_data);
486
487 retval = 0;
488 goto out;
489 }
490 /* Now use mmap to map the library into memory. */
491 down_write(&current->mm->mmap_sem);
492 error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
493 PROT_READ | PROT_WRITE | PROT_EXEC,
494 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
495 N_TXTOFF(ex));
496 up_write(&current->mm->mmap_sem);
497 retval = error;
498 if (error != start_addr)
499 goto out;
500
501 len = PAGE_ALIGN(ex.a_text + ex.a_data);
502 bss = ex.a_text + ex.a_data + ex.a_bss;
503 if (bss > len) {
504 down_write(&current->mm->mmap_sem);
505 error = do_brk(start_addr + len, bss - len);
506 up_write(&current->mm->mmap_sem);
507 retval = error;
508 if (error != start_addr + len)
509 goto out;
510 }
511 retval = 0;
512out:
513 return retval;
514}
515
516static int __init init_aout_binfmt(void)
517{
518 return register_binfmt(&aout_format);
519}
520
521static void __exit exit_aout_binfmt(void)
522{
523 unregister_binfmt(&aout_format);
524}
525
526module_init(init_aout_binfmt);
527module_exit(exit_aout_binfmt);
528MODULE_LICENSE("GPL");
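The WARN_OLD messages in the loader above fire when an old-style a.out binary keeps its text at a non-page-aligned file offset; such binaries take the slow do_brk()+read() path instead of being mmap()ed. A minimal userspace sketch of the same check, assuming the historical <a.out.h> header (struct exec, N_TXTOFF()) is still installed:

#include <a.out.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct exec ex;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || read(fd, &ex, sizeof(ex)) != sizeof(ex))
		return 1;
	/* Same test the loader performs: low 12 bits of the text offset. */
	printf("%s: text offset %#lx is %spage aligned\n", argv[1],
	       (unsigned long)N_TXTOFF(ex),
	       (N_TXTOFF(ex) & 0xfff) ? "not " : "");
	close(fd);
	return 0;
}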
diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c
new file mode 100644
index 000000000000..dffd2ac72747
--- /dev/null
+++ b/arch/x86/ia32/ia32_binfmt.c
@@ -0,0 +1,320 @@
1/*
2 * Written 2000,2002 by Andi Kleen.
3 *
4 * Loosely based on the sparc64 and IA64 32bit emulation loaders.
5 * This tricks binfmt_elf.c into loading 32bit binaries using lots
6 * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
7 */
8#define __ASM_X86_64_ELF_H 1
9
10#undef ELF_CLASS
11#define ELF_CLASS ELFCLASS32
12
13#include <linux/types.h>
14#include <linux/stddef.h>
15#include <linux/rwsem.h>
16#include <linux/sched.h>
17#include <linux/compat.h>
18#include <linux/string.h>
19#include <linux/binfmts.h>
20#include <linux/mm.h>
21#include <linux/security.h>
22
23#include <asm/segment.h>
24#include <asm/ptrace.h>
25#include <asm/processor.h>
26#include <asm/user32.h>
27#include <asm/sigcontext32.h>
28#include <asm/fpu32.h>
29#include <asm/i387.h>
30#include <asm/uaccess.h>
31#include <asm/ia32.h>
32#include <asm/vsyscall32.h>
33
34#define ELF_NAME "elf/i386"
35
36#define AT_SYSINFO 32
37#define AT_SYSINFO_EHDR 33
38
39int sysctl_vsyscall32 = 1;
40
41#undef ARCH_DLINFO
42#define ARCH_DLINFO do { \
43 if (sysctl_vsyscall32) { \
44 current->mm->context.vdso = (void *)VSYSCALL32_BASE; \
45 NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
46 NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
47 } \
48} while(0)
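ARCH_DLINFO above publishes the 32-bit vsyscall entry point and vDSO base to a new compat process as AT_SYSINFO (32) and AT_SYSINFO_EHDR (33) auxiliary-vector entries whenever sysctl_vsyscall32 is set. An illustrative sketch of a 32-bit program reading those entries back out of /proc/self/auxv (32-bit build assumed, so the entries are Elf32-sized):

#include <elf.h>
#include <stdio.h>

int main(void)
{
	Elf32_auxv_t aux;
	FILE *f = fopen("/proc/self/auxv", "rb");

	if (!f)
		return 1;
	while (fread(&aux, sizeof(aux), 1, f) == 1 && aux.a_type != AT_NULL) {
		if (aux.a_type == 32)	/* AT_SYSINFO */
			printf("AT_SYSINFO      = %#x\n", aux.a_un.a_val);
		if (aux.a_type == 33)	/* AT_SYSINFO_EHDR */
			printf("AT_SYSINFO_EHDR = %#x\n", aux.a_un.a_val);
	}
	fclose(f);
	return 0;
}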
49
50struct file;
51struct elf_phdr;
52
53#define IA32_EMULATOR 1
54
55#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
56
57#undef ELF_ARCH
58#define ELF_ARCH EM_386
59
60#define ELF_DATA ELFDATA2LSB
61
62#define USE_ELF_CORE_DUMP 1
63
64/* Override elfcore.h */
65#define _LINUX_ELFCORE_H 1
66typedef unsigned int elf_greg_t;
67
68#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t))
69typedef elf_greg_t elf_gregset_t[ELF_NGREG];
70
71struct elf_siginfo
72{
73 int si_signo; /* signal number */
74 int si_code; /* extra code */
75 int si_errno; /* errno */
76};
77
78#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
79
80struct elf_prstatus
81{
82 struct elf_siginfo pr_info; /* Info associated with signal */
83 short pr_cursig; /* Current signal */
84 unsigned int pr_sigpend; /* Set of pending signals */
85 unsigned int pr_sighold; /* Set of held signals */
86 pid_t pr_pid;
87 pid_t pr_ppid;
88 pid_t pr_pgrp;
89 pid_t pr_sid;
90 struct compat_timeval pr_utime; /* User time */
91 struct compat_timeval pr_stime; /* System time */
92 struct compat_timeval pr_cutime; /* Cumulative user time */
93 struct compat_timeval pr_cstime; /* Cumulative system time */
94 elf_gregset_t pr_reg; /* GP registers */
95 int pr_fpvalid; /* True if math co-processor being used. */
96};
97
98#define ELF_PRARGSZ (80) /* Number of chars for args */
99
100struct elf_prpsinfo
101{
102 char pr_state; /* numeric process state */
103 char pr_sname; /* char for pr_state */
104 char pr_zomb; /* zombie */
105 char pr_nice; /* nice val */
106 unsigned int pr_flag; /* flags */
107 __u16 pr_uid;
108 __u16 pr_gid;
109 pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid;
110 /* Lots missing */
111 char pr_fname[16]; /* filename of executable */
112 char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
113};
114
115#define __STR(x) #x
116#define STR(x) __STR(x)
117
118#define _GET_SEG(x) \
119 ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; })
120
121/* Assumes current==process to be dumped */
122#define ELF_CORE_COPY_REGS(pr_reg, regs) \
123 pr_reg[0] = regs->rbx; \
124 pr_reg[1] = regs->rcx; \
125 pr_reg[2] = regs->rdx; \
126 pr_reg[3] = regs->rsi; \
127 pr_reg[4] = regs->rdi; \
128 pr_reg[5] = regs->rbp; \
129 pr_reg[6] = regs->rax; \
130 pr_reg[7] = _GET_SEG(ds); \
131 pr_reg[8] = _GET_SEG(es); \
132 pr_reg[9] = _GET_SEG(fs); \
133 pr_reg[10] = _GET_SEG(gs); \
134 pr_reg[11] = regs->orig_rax; \
135 pr_reg[12] = regs->rip; \
136 pr_reg[13] = regs->cs; \
137 pr_reg[14] = regs->eflags; \
138 pr_reg[15] = regs->rsp; \
139 pr_reg[16] = regs->ss;
140
141#define user user32
142
143#undef elf_read_implies_exec
144#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
145//#include <asm/ia32.h>
146#include <linux/elf.h>
147
148typedef struct user_i387_ia32_struct elf_fpregset_t;
149typedef struct user32_fxsr_struct elf_fpxregset_t;
150
151
152static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
153{
154 ELF_CORE_COPY_REGS((*elfregs), regs)
155}
156
157static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
158{
159 struct pt_regs *pp = task_pt_regs(t);
160 ELF_CORE_COPY_REGS((*elfregs), pp);
161 /* fix wrong segments */
162 (*elfregs)[7] = t->thread.ds;
163 (*elfregs)[9] = t->thread.fsindex;
164 (*elfregs)[10] = t->thread.gsindex;
165 (*elfregs)[8] = t->thread.es;
166 return 1;
167}
168
169static inline int
170elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu)
171{
172 struct _fpstate_ia32 *fpstate = (void*)fpu;
173 mm_segment_t oldfs = get_fs();
174
175 if (!tsk_used_math(tsk))
176 return 0;
177 if (!regs)
178 regs = task_pt_regs(tsk);
179 if (tsk == current)
180 unlazy_fpu(tsk);
181 set_fs(KERNEL_DS);
182 save_i387_ia32(tsk, fpstate, regs, 1);
183 /* Correct for i386 bug. It puts the fop into the upper 16bits of
184 the tag word (like FXSAVE), not into the fcs*/
185 fpstate->cssel |= fpstate->tag & 0xffff0000;
186 set_fs(oldfs);
187 return 1;
188}
189
190#define ELF_CORE_COPY_XFPREGS 1
191static inline int
192elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
193{
194 struct pt_regs *regs = task_pt_regs(t);
195 if (!tsk_used_math(t))
196 return 0;
197 if (t == current)
198 unlazy_fpu(t);
199 memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
200 xfpu->fcs = regs->cs;
201 xfpu->fos = t->thread.ds; /* right? */
202 return 1;
203}
204
205#undef elf_check_arch
206#define elf_check_arch(x) \
207 ((x)->e_machine == EM_386)
208
209extern int force_personality32;
210
211#define ELF_EXEC_PAGESIZE PAGE_SIZE
212#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
213#define ELF_PLATFORM ("i686")
214#define SET_PERSONALITY(ex, ibcs2) \
215do { \
216 unsigned long new_flags = 0; \
217 if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
218 new_flags = _TIF_IA32; \
219 if ((current_thread_info()->flags & _TIF_IA32) \
220 != new_flags) \
221 set_thread_flag(TIF_ABI_PENDING); \
222 else \
223 clear_thread_flag(TIF_ABI_PENDING); \
224 /* XXX This overwrites the user set personality */ \
225 current->personality |= force_personality32; \
226} while (0)
227
228/* Override some function names */
229#define elf_format elf32_format
230
231#define init_elf_binfmt init_elf32_binfmt
232#define exit_elf_binfmt exit_elf32_binfmt
233
234#define load_elf_binary load_elf32_binary
235
236#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
237
238#undef start_thread
239#define start_thread(regs,new_rip,new_rsp) do { \
240 asm volatile("movl %0,%%fs" :: "r" (0)); \
241 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
242 load_gs_index(0); \
243 (regs)->rip = (new_rip); \
244 (regs)->rsp = (new_rsp); \
245 (regs)->eflags = 0x200; \
246 (regs)->cs = __USER32_CS; \
247 (regs)->ss = __USER32_DS; \
248 set_fs(USER_DS); \
249} while(0)
250
251
252#include <linux/module.h>
253
254MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
255MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
256
257#undef MODULE_DESCRIPTION
258#undef MODULE_AUTHOR
259
260static void elf32_init(struct pt_regs *);
261
262#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
263#define arch_setup_additional_pages syscall32_setup_pages
264extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
265
266#include "../../../fs/binfmt_elf.c"
267
268static void elf32_init(struct pt_regs *regs)
269{
270 struct task_struct *me = current;
271 regs->rdi = 0;
272 regs->rsi = 0;
273 regs->rdx = 0;
274 regs->rcx = 0;
275 regs->rax = 0;
276 regs->rbx = 0;
277 regs->rbp = 0;
278 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
279 regs->r13 = regs->r14 = regs->r15 = 0;
280 me->thread.fs = 0;
281 me->thread.gs = 0;
282 me->thread.fsindex = 0;
283 me->thread.gsindex = 0;
284 me->thread.ds = __USER_DS;
285 me->thread.es = __USER_DS;
286}
287
288#ifdef CONFIG_SYSCTL
289/* Register vsyscall32 into the ABI table */
290#include <linux/sysctl.h>
291
292static ctl_table abi_table2[] = {
293 {
294 .ctl_name = 99,
295 .procname = "vsyscall32",
296 .data = &sysctl_vsyscall32,
297 .maxlen = sizeof(int),
298 .mode = 0644,
299 .proc_handler = proc_dointvec
300 },
301 {}
302};
303
304static ctl_table abi_root_table2[] = {
305 {
306 .ctl_name = CTL_ABI,
307 .procname = "abi",
308 .mode = 0555,
309 .child = abi_table2
310 },
311 {}
312};
313
314static __init int ia32_binfmt_init(void)
315{
316 register_sysctl_table(abi_root_table2);
317 return 0;
318}
319__initcall(ia32_binfmt_init);
320#endif
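The sysctl table registered above appears as /proc/sys/abi/vsyscall32 and toggles the sysctl_vsyscall32 flag consulted by ARCH_DLINFO for subsequently exec'ed 32-bit processes. A small sketch of flipping it (path assumed from the table above; requires root):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* "0" disables the 32-bit vsyscall page for new processes, "1" enables it. */
	const char *val = (argc > 1) ? argv[1] : "1";
	int fd = open("/proc/sys/abi/vsyscall32", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, 1) != 1)
		perror("write");
	close(fd);
	return 0;
}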
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
new file mode 100644
index 000000000000..6ea19c25f90d
--- /dev/null
+++ b/arch/x86/ia32/ia32_signal.c
@@ -0,0 +1,617 @@
1/*
2 * linux/arch/x86_64/ia32/ia32_signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
9 */
10
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/kernel.h>
15#include <linux/signal.h>
16#include <linux/errno.h>
17#include <linux/wait.h>
18#include <linux/ptrace.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compat.h>
23#include <linux/binfmts.h>
24#include <asm/ucontext.h>
25#include <asm/uaccess.h>
26#include <asm/i387.h>
27#include <asm/ia32.h>
28#include <asm/ptrace.h>
29#include <asm/ia32_unistd.h>
30#include <asm/user32.h>
31#include <asm/sigcontext32.h>
32#include <asm/fpu32.h>
33#include <asm/proto.h>
34#include <asm/vsyscall32.h>
35
36#define DEBUG_SIG 0
37
38#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
39
40asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
41void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
42
43int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
44{
45 int err;
46 if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
47 return -EFAULT;
48
49 /* If you change siginfo_t structure, please make sure that
50 this code is fixed accordingly.
51 It should never copy any pad contained in the structure
52 to avoid security leaks, but must copy the generic
53 3 ints plus the relevant union member. */
54 err = __put_user(from->si_signo, &to->si_signo);
55 err |= __put_user(from->si_errno, &to->si_errno);
56 err |= __put_user((short)from->si_code, &to->si_code);
57
58 if (from->si_code < 0) {
59 err |= __put_user(from->si_pid, &to->si_pid);
60 err |= __put_user(from->si_uid, &to->si_uid);
61 err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
62 } else {
63 /* First 32bits of unions are always present:
64 * si_pid === si_band === si_tid === si_addr(LS half) */
65 err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
66 switch (from->si_code >> 16) {
67 case __SI_FAULT >> 16:
68 break;
69 case __SI_CHLD >> 16:
70 err |= __put_user(from->si_utime, &to->si_utime);
71 err |= __put_user(from->si_stime, &to->si_stime);
72 err |= __put_user(from->si_status, &to->si_status);
73 /* FALL THROUGH */
74 default:
75 case __SI_KILL >> 16:
76 err |= __put_user(from->si_uid, &to->si_uid);
77 break;
78 case __SI_POLL >> 16:
79 err |= __put_user(from->si_fd, &to->si_fd);
80 break;
81 case __SI_TIMER >> 16:
82 err |= __put_user(from->si_overrun, &to->si_overrun);
83 err |= __put_user(ptr_to_compat(from->si_ptr),
84 &to->si_ptr);
85 break;
86 case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
87 case __SI_MESGQ >> 16:
88 err |= __put_user(from->si_uid, &to->si_uid);
89 err |= __put_user(from->si_int, &to->si_int);
90 break;
91 }
92 }
93 return err;
94}
95
96int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
97{
98 int err;
99 u32 ptr32;
100 if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
101 return -EFAULT;
102
103 err = __get_user(to->si_signo, &from->si_signo);
104 err |= __get_user(to->si_errno, &from->si_errno);
105 err |= __get_user(to->si_code, &from->si_code);
106
107 err |= __get_user(to->si_pid, &from->si_pid);
108 err |= __get_user(to->si_uid, &from->si_uid);
109 err |= __get_user(ptr32, &from->si_ptr);
110 to->si_ptr = compat_ptr(ptr32);
111
112 return err;
113}
114
115asmlinkage long
116sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
117{
118 mask &= _BLOCKABLE;
119 spin_lock_irq(&current->sighand->siglock);
120 current->saved_sigmask = current->blocked;
121 siginitset(&current->blocked, mask);
122 recalc_sigpending();
123 spin_unlock_irq(&current->sighand->siglock);
124
125 current->state = TASK_INTERRUPTIBLE;
126 schedule();
127 set_thread_flag(TIF_RESTORE_SIGMASK);
128 return -ERESTARTNOHAND;
129}
130
131asmlinkage long
132sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
133 stack_ia32_t __user *uoss_ptr,
134 struct pt_regs *regs)
135{
136 stack_t uss,uoss;
137 int ret;
138 mm_segment_t seg;
139 if (uss_ptr) {
140 u32 ptr;
141 memset(&uss,0,sizeof(stack_t));
142 if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
143 __get_user(ptr, &uss_ptr->ss_sp) ||
144 __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
145 __get_user(uss.ss_size, &uss_ptr->ss_size))
146 return -EFAULT;
147 uss.ss_sp = compat_ptr(ptr);
148 }
149 seg = get_fs();
150 set_fs(KERNEL_DS);
151 ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
152 set_fs(seg);
153 if (ret >= 0 && uoss_ptr) {
154 if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
155 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
156 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
157 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
158 ret = -EFAULT;
159 }
160 return ret;
161}
162
163/*
164 * Do a signal return; undo the signal stack.
165 */
166
167struct sigframe
168{
169 u32 pretcode;
170 int sig;
171 struct sigcontext_ia32 sc;
172 struct _fpstate_ia32 fpstate;
173 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
174 char retcode[8];
175};
176
177struct rt_sigframe
178{
179 u32 pretcode;
180 int sig;
181 u32 pinfo;
182 u32 puc;
183 compat_siginfo_t info;
184 struct ucontext_ia32 uc;
185 struct _fpstate_ia32 fpstate;
186 char retcode[8];
187};
188
189static int
190ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
191{
192 unsigned int err = 0;
193
194 /* Always make any pending restarted system calls return -EINTR */
195 current_thread_info()->restart_block.fn = do_no_restart_syscall;
196
197#if DEBUG_SIG
198 printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
199 sc, sc->err, sc->eip, sc->cs, sc->eflags);
200#endif
201#define COPY(x) { \
202 unsigned int reg; \
203 err |= __get_user(reg, &sc->e ##x); \
204 regs->r ## x = reg; \
205}
206
207#define RELOAD_SEG(seg,mask) \
208 { unsigned int cur; \
209 unsigned short pre; \
210 err |= __get_user(pre, &sc->seg); \
211 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
212 pre |= mask; \
213 if (pre != cur) loadsegment(seg,pre); }
214
215 /* Reload fs and gs if they have changed in the signal handler.
216	   This does not handle long fs/gs base changes in the handler, but
217	   at least does not clobber them in the normal case. */
218
219 {
220 unsigned gs, oldgs;
221 err |= __get_user(gs, &sc->gs);
222 gs |= 3;
223 asm("movl %%gs,%0" : "=r" (oldgs));
224 if (gs != oldgs)
225 load_gs_index(gs);
226 }
227 RELOAD_SEG(fs,3);
228 RELOAD_SEG(ds,3);
229 RELOAD_SEG(es,3);
230
231 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
232 COPY(dx); COPY(cx); COPY(ip);
233 /* Don't touch extended registers */
234
235 err |= __get_user(regs->cs, &sc->cs);
236 regs->cs |= 3;
237 err |= __get_user(regs->ss, &sc->ss);
238 regs->ss |= 3;
239
240 {
241 unsigned int tmpflags;
242 err |= __get_user(tmpflags, &sc->eflags);
243 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
244 regs->orig_rax = -1; /* disable syscall checks */
245 }
246
247 {
248 u32 tmp;
249 struct _fpstate_ia32 __user * buf;
250 err |= __get_user(tmp, &sc->fpstate);
251 buf = compat_ptr(tmp);
252 if (buf) {
253 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
254 goto badframe;
255 err |= restore_i387_ia32(current, buf, 0);
256 } else {
257 struct task_struct *me = current;
258 if (used_math()) {
259 clear_fpu(me);
260 clear_used_math();
261 }
262 }
263 }
264
265 {
266 u32 tmp;
267 err |= __get_user(tmp, &sc->eax);
268 *peax = tmp;
269 }
270 return err;
271
272badframe:
273 return 1;
274}
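For reference, the 0x40DD5 literal used when restoring eflags in ia32_restore_sigcontext() above is the set of arithmetic and control flags a signal handler is allowed to modify. A small sketch reconstructing it from the standard EFLAGS bit positions (the X86_EFLAGS_* names are used here only for illustration):

#include <stdio.h>

/* Standard x86 EFLAGS bit positions. */
#define X86_EFLAGS_CF (1u << 0)
#define X86_EFLAGS_PF (1u << 2)
#define X86_EFLAGS_AF (1u << 4)
#define X86_EFLAGS_ZF (1u << 6)
#define X86_EFLAGS_SF (1u << 7)
#define X86_EFLAGS_TF (1u << 8)
#define X86_EFLAGS_DF (1u << 10)
#define X86_EFLAGS_OF (1u << 11)
#define X86_EFLAGS_AC (1u << 18)

int main(void)
{
	unsigned int mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
			    X86_EFLAGS_DF | X86_EFLAGS_OF | X86_EFLAGS_AC;

	/* Prints 0x40dd5, matching the literal in ia32_restore_sigcontext(). */
	printf("%#x\n", mask);
	return 0;
}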
275
276asmlinkage long sys32_sigreturn(struct pt_regs *regs)
277{
278 struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
279 sigset_t set;
280 unsigned int eax;
281
282 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
283 goto badframe;
284 if (__get_user(set.sig[0], &frame->sc.oldmask)
285 || (_COMPAT_NSIG_WORDS > 1
286 && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
287 sizeof(frame->extramask))))
288 goto badframe;
289
290 sigdelsetmask(&set, ~_BLOCKABLE);
291 spin_lock_irq(&current->sighand->siglock);
292 current->blocked = set;
293 recalc_sigpending();
294 spin_unlock_irq(&current->sighand->siglock);
295
296 if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
297 goto badframe;
298 return eax;
299
300badframe:
301 signal_fault(regs, frame, "32bit sigreturn");
302 return 0;
303}
304
305asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
306{
307 struct rt_sigframe __user *frame;
308 sigset_t set;
309 unsigned int eax;
310 struct pt_regs tregs;
311
312 frame = (struct rt_sigframe __user *)(regs->rsp - 4);
313
314 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
315 goto badframe;
316 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
317 goto badframe;
318
319 sigdelsetmask(&set, ~_BLOCKABLE);
320 spin_lock_irq(&current->sighand->siglock);
321 current->blocked = set;
322 recalc_sigpending();
323 spin_unlock_irq(&current->sighand->siglock);
324
325 if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
326 goto badframe;
327
328 tregs = *regs;
329 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
330 goto badframe;
331
332 return eax;
333
334badframe:
335 signal_fault(regs,frame,"32bit rt sigreturn");
336 return 0;
337}
338
339/*
340 * Set up a signal frame.
341 */
342
343static int
344ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
345 struct pt_regs *regs, unsigned int mask)
346{
347 int tmp, err = 0;
348
349 tmp = 0;
350 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
351 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
352 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
353 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
354 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
355 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
356 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
357 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
358
359 err |= __put_user((u32)regs->rdi, &sc->edi);
360 err |= __put_user((u32)regs->rsi, &sc->esi);
361 err |= __put_user((u32)regs->rbp, &sc->ebp);
362 err |= __put_user((u32)regs->rsp, &sc->esp);
363 err |= __put_user((u32)regs->rbx, &sc->ebx);
364 err |= __put_user((u32)regs->rdx, &sc->edx);
365 err |= __put_user((u32)regs->rcx, &sc->ecx);
366 err |= __put_user((u32)regs->rax, &sc->eax);
367 err |= __put_user((u32)regs->cs, &sc->cs);
368 err |= __put_user((u32)regs->ss, &sc->ss);
369 err |= __put_user(current->thread.trap_no, &sc->trapno);
370 err |= __put_user(current->thread.error_code, &sc->err);
371 err |= __put_user((u32)regs->rip, &sc->eip);
372 err |= __put_user((u32)regs->eflags, &sc->eflags);
373 err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
374
375 tmp = save_i387_ia32(current, fpstate, regs, 0);
376 if (tmp < 0)
377 err = -EFAULT;
378 else {
379 clear_used_math();
380 stts();
381 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
382 &sc->fpstate);
383 }
384
385 /* non-iBCS2 extensions.. */
386 err |= __put_user(mask, &sc->oldmask);
387 err |= __put_user(current->thread.cr2, &sc->cr2);
388
389 return err;
390}
391
392/*
393 * Determine which stack to use..
394 */
395static void __user *
396get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
397{
398 unsigned long rsp;
399
400 /* Default to using normal stack */
401 rsp = regs->rsp;
402
403 /* This is the X/Open sanctioned signal stack switching. */
404 if (ka->sa.sa_flags & SA_ONSTACK) {
405 if (sas_ss_flags(rsp) == 0)
406 rsp = current->sas_ss_sp + current->sas_ss_size;
407 }
408
409 /* This is the legacy signal stack switching. */
410 else if ((regs->ss & 0xffff) != __USER_DS &&
411 !(ka->sa.sa_flags & SA_RESTORER) &&
412 ka->sa.sa_restorer) {
413 rsp = (unsigned long) ka->sa.sa_restorer;
414 }
415
416 rsp -= frame_size;
417 /* Align the stack pointer according to the i386 ABI,
418 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
419 rsp = ((rsp + 4) & -16ul) - 4;
420 return (void __user *) rsp;
421}
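A quick worked check of the alignment arithmetic at the end of get_sigframe() above: after rsp = ((rsp + 4) & -16ul) - 4, the handler always observes ((esp + 4) & 15) == 0, as the i386 ABI comment states. A small sketch with an arbitrary input value:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Arbitrary example value for the stack pointer after the frame
	 * size has been subtracted; any input gives the same post-condition. */
	unsigned long rsp = 0xffffd123UL;

	rsp = ((rsp + 4) & -16UL) - 4;
	assert(((rsp + 4) & 15) == 0);
	printf("aligned sp = %#lx\n", rsp);	/* 0xffffd11c for this input */
	return 0;
}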
422
423int ia32_setup_frame(int sig, struct k_sigaction *ka,
424 compat_sigset_t *set, struct pt_regs * regs)
425{
426 struct sigframe __user *frame;
427 int err = 0;
428
429 frame = get_sigframe(ka, regs, sizeof(*frame));
430
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 goto give_sigsegv;
433
434 err |= __put_user(sig, &frame->sig);
435 if (err)
436 goto give_sigsegv;
437
438 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
439 set->sig[0]);
440 if (err)
441 goto give_sigsegv;
442
443 if (_COMPAT_NSIG_WORDS > 1) {
444 err |= __copy_to_user(frame->extramask, &set->sig[1],
445 sizeof(frame->extramask));
446 }
447 if (err)
448 goto give_sigsegv;
449
450 /* Return stub is in 32bit vsyscall page */
451 {
452 void __user *restorer;
453 if (current->binfmt->hasvdso)
454 restorer = VSYSCALL32_SIGRETURN;
455 else
456 restorer = (void *)&frame->retcode;
457 if (ka->sa.sa_flags & SA_RESTORER)
458 restorer = ka->sa.sa_restorer;
459 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
460 }
461 /* These are actually not used anymore, but left because some
462 gdb versions depend on them as a marker. */
463 {
464 /* copy_to_user optimizes that into a single 8 byte store */
465 static const struct {
466 u16 poplmovl;
467 u32 val;
468 u16 int80;
469 u16 pad;
470 } __attribute__((packed)) code = {
471 0xb858, /* popl %eax ; movl $...,%eax */
472 __NR_ia32_sigreturn,
473 0x80cd, /* int $0x80 */
474 0,
475 };
476 err |= __copy_to_user(frame->retcode, &code, 8);
477 }
478 if (err)
479 goto give_sigsegv;
480
481 /* Set up registers for signal handler */
482 regs->rsp = (unsigned long) frame;
483 regs->rip = (unsigned long) ka->sa.sa_handler;
484
485 /* Make -mregparm=3 work */
486 regs->rax = sig;
487 regs->rdx = 0;
488 regs->rcx = 0;
489
490 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
491 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
492
493 regs->cs = __USER32_CS;
494 regs->ss = __USER32_DS;
495
496 set_fs(USER_DS);
497 regs->eflags &= ~TF_MASK;
498 if (test_thread_flag(TIF_SINGLESTEP))
499 ptrace_notify(SIGTRAP);
500
501#if DEBUG_SIG
502 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
503 current->comm, current->pid, frame, regs->rip, frame->pretcode);
504#endif
505
506 return 0;
507
508give_sigsegv:
509 force_sigsegv(sig, current);
510 return -EFAULT;
511}
512
513int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
514 compat_sigset_t *set, struct pt_regs * regs)
515{
516 struct rt_sigframe __user *frame;
517 int err = 0;
518
519 frame = get_sigframe(ka, regs, sizeof(*frame));
520
521 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
522 goto give_sigsegv;
523
524 {
525 struct exec_domain *ed = current_thread_info()->exec_domain;
526 err |= __put_user((ed
527 && ed->signal_invmap
528 && sig < 32
529 ? ed->signal_invmap[sig]
530 : sig),
531 &frame->sig);
532 }
533 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
534 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
535 err |= copy_siginfo_to_user32(&frame->info, info);
536 if (err)
537 goto give_sigsegv;
538
539 /* Create the ucontext. */
540 err |= __put_user(0, &frame->uc.uc_flags);
541 err |= __put_user(0, &frame->uc.uc_link);
542 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
543 err |= __put_user(sas_ss_flags(regs->rsp),
544 &frame->uc.uc_stack.ss_flags);
545 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
546 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
547 regs, set->sig[0]);
548 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
549 if (err)
550 goto give_sigsegv;
551
552
553 {
554 void __user *restorer = VSYSCALL32_RTSIGRETURN;
555 if (ka->sa.sa_flags & SA_RESTORER)
556 restorer = ka->sa.sa_restorer;
557 err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
558 }
559
560 /* This is movl $,%eax ; int $0x80 */
561 /* Not actually used anymore, but left because some gdb versions
562 need it. */
563 {
564 /* __copy_to_user optimizes that into a single 8 byte store */
565 static const struct {
566 u8 movl;
567 u32 val;
568 u16 int80;
569 u16 pad;
570 u8 pad2;
571 } __attribute__((packed)) code = {
572 0xb8,
573 __NR_ia32_rt_sigreturn,
574 0x80cd,
575 0,
576 };
577 err |= __copy_to_user(frame->retcode, &code, 8);
578 }
579 if (err)
580 goto give_sigsegv;
581
582 /* Set up registers for signal handler */
583 regs->rsp = (unsigned long) frame;
584 regs->rip = (unsigned long) ka->sa.sa_handler;
585
586 /* Make -mregparm=3 work */
587 regs->rax = sig;
588 regs->rdx = (unsigned long) &frame->info;
589 regs->rcx = (unsigned long) &frame->uc;
595
596 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
597 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
598
599 regs->cs = __USER32_CS;
600 regs->ss = __USER32_DS;
601
602 set_fs(USER_DS);
603 regs->eflags &= ~TF_MASK;
604 if (test_thread_flag(TIF_SINGLESTEP))
605 ptrace_notify(SIGTRAP);
606
607#if DEBUG_SIG
608 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
609 current->comm, current->pid, frame, regs->rip, frame->pretcode);
610#endif
611
612 return 0;
613
614give_sigsegv:
615 force_sigsegv(sig, current);
616 return -EFAULT;
617}
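The rt frame built above is what a 32-bit SA_SIGINFO handler consumes: frame->sig, &frame->info and &frame->uc become its three arguments (and are also preloaded into %eax/%edx/%ecx for -mregparm=3 builds, as the comment notes). An illustrative 32-bit userspace counterpart (printf in a handler is not async-signal-safe; it is used here only for the sketch):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void handler(int sig, siginfo_t *info, void *ucontext)
{
	/* sig, info and ucontext correspond to frame->sig, &frame->info
	 * and &frame->uc from ia32_setup_rt_frame(). */
	printf("sig=%d si_code=%d uc=%p\n", sig, info->si_code, ucontext);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}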
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
new file mode 100644
index 000000000000..18b231810908
--- /dev/null
+++ b/arch/x86/ia32/ia32entry.S
@@ -0,0 +1,736 @@
1/*
2 * Compatibility mode system call entry point for x86-64.
3 *
4 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
5 */
6
7#include <asm/dwarf2.h>
8#include <asm/calling.h>
9#include <asm/asm-offsets.h>
10#include <asm/current.h>
11#include <asm/errno.h>
12#include <asm/ia32_unistd.h>
13#include <asm/thread_info.h>
14#include <asm/segment.h>
15#include <asm/vsyscall32.h>
16#include <asm/irqflags.h>
17#include <linux/linkage.h>
18
19#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
20
21 .macro IA32_ARG_FIXUP noebp=0
22 movl %edi,%r8d
23 .if \noebp
24 .else
25 movl %ebp,%r9d
26 .endif
27 xchg %ecx,%esi
28 movl %ebx,%edi
29 movl %edx,%edx /* zero extension */
30 .endm
31
32 /* clobbers %eax */
33 .macro CLEAR_RREGS
34 xorl %eax,%eax
35 movq %rax,R11(%rsp)
36 movq %rax,R10(%rsp)
37 movq %rax,R9(%rsp)
38 movq %rax,R8(%rsp)
39 .endm
40
41 .macro LOAD_ARGS32 offset
42 movl \offset(%rsp),%r11d
43 movl \offset+8(%rsp),%r10d
44 movl \offset+16(%rsp),%r9d
45 movl \offset+24(%rsp),%r8d
46 movl \offset+40(%rsp),%ecx
47 movl \offset+48(%rsp),%edx
48 movl \offset+56(%rsp),%esi
49 movl \offset+64(%rsp),%edi
50 movl \offset+72(%rsp),%eax
51 .endm
52
53 .macro CFI_STARTPROC32 simple
54 CFI_STARTPROC \simple
55 CFI_UNDEFINED r8
56 CFI_UNDEFINED r9
57 CFI_UNDEFINED r10
58 CFI_UNDEFINED r11
59 CFI_UNDEFINED r12
60 CFI_UNDEFINED r13
61 CFI_UNDEFINED r14
62 CFI_UNDEFINED r15
63 .endm
64
65/*
66 * 32bit SYSENTER instruction entry.
67 *
68 * Arguments:
69 * %eax System call number.
70 * %ebx Arg1
71 * %ecx Arg2
72 * %edx Arg3
73 * %esi Arg4
74 * %edi Arg5
75 * %ebp user stack
76 * 0(%ebp) Arg6
77 *
78 * Interrupts off.
79 *
80 * This is purely a fast path. For anything complicated we use the int 0x80
81 * path below. Set up a complete hardware stack frame to share code
82 * with the int 0x80 path.
83 */
84ENTRY(ia32_sysenter_target)
85 CFI_STARTPROC32 simple
86 CFI_SIGNAL_FRAME
87 CFI_DEF_CFA rsp,0
88 CFI_REGISTER rsp,rbp
89 swapgs
90 movq %gs:pda_kernelstack, %rsp
91 addq $(PDA_STACKOFFSET),%rsp
92 /*
93 * No need to follow this irqs on/off section: the syscall
 94	 * disabled irqs; here we enable them straight after entry:
95 */
96 sti
97 movl %ebp,%ebp /* zero extension */
98 pushq $__USER32_DS
99 CFI_ADJUST_CFA_OFFSET 8
100 /*CFI_REL_OFFSET ss,0*/
101 pushq %rbp
102 CFI_ADJUST_CFA_OFFSET 8
103 CFI_REL_OFFSET rsp,0
104 pushfq
105 CFI_ADJUST_CFA_OFFSET 8
106 /*CFI_REL_OFFSET rflags,0*/
107 movl $VSYSCALL32_SYSEXIT, %r10d
108 CFI_REGISTER rip,r10
109 pushq $__USER32_CS
110 CFI_ADJUST_CFA_OFFSET 8
111 /*CFI_REL_OFFSET cs,0*/
112 movl %eax, %eax
113 pushq %r10
114 CFI_ADJUST_CFA_OFFSET 8
115 CFI_REL_OFFSET rip,0
116 pushq %rax
117 CFI_ADJUST_CFA_OFFSET 8
118 cld
119 SAVE_ARGS 0,0,1
120 /* no need to do an access_ok check here because rbp has been
121 32bit zero extended */
1221: movl (%rbp),%r9d
123 .section __ex_table,"a"
124 .quad 1b,ia32_badarg
125 .previous
126 GET_THREAD_INFO(%r10)
127 orl $TS_COMPAT,threadinfo_status(%r10)
128 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
129 CFI_REMEMBER_STATE
130 jnz sysenter_tracesys
131sysenter_do_call:
132 cmpl $(IA32_NR_syscalls-1),%eax
133 ja ia32_badsys
134 IA32_ARG_FIXUP 1
135 call *ia32_sys_call_table(,%rax,8)
136 movq %rax,RAX-ARGOFFSET(%rsp)
137 GET_THREAD_INFO(%r10)
138 cli
139 TRACE_IRQS_OFF
140 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
141 jnz int_ret_from_sys_call
142 andl $~TS_COMPAT,threadinfo_status(%r10)
143 /* clear IF, that popfq doesn't enable interrupts early */
144 andl $~0x200,EFLAGS-R11(%rsp)
145 RESTORE_ARGS 1,24,1,1,1,1
146 popfq
147 CFI_ADJUST_CFA_OFFSET -8
148 /*CFI_RESTORE rflags*/
149 popq %rcx /* User %esp */
150 CFI_ADJUST_CFA_OFFSET -8
151 CFI_REGISTER rsp,rcx
152 movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
153 CFI_REGISTER rip,rdx
154 TRACE_IRQS_ON
155 swapgs
156 sti /* sti only takes effect after the next instruction */
157 /* sysexit */
158 .byte 0xf, 0x35
159
160sysenter_tracesys:
161 CFI_RESTORE_STATE
162 SAVE_REST
163 CLEAR_RREGS
164 movq $-ENOSYS,RAX(%rsp) /* really needed? */
165 movq %rsp,%rdi /* &pt_regs -> arg1 */
166 call syscall_trace_enter
167 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
168 RESTORE_REST
169 movl %ebp, %ebp
170 /* no need to do an access_ok check here because rbp has been
171 32bit zero extended */
1721: movl (%rbp),%r9d
173 .section __ex_table,"a"
174 .quad 1b,ia32_badarg
175 .previous
176 jmp sysenter_do_call
177 CFI_ENDPROC
178ENDPROC(ia32_sysenter_target)
179
180/*
181 * 32bit SYSCALL instruction entry.
182 *
183 * Arguments:
184 * %eax System call number.
185 * %ebx Arg1
186 * %ecx return EIP
187 * %edx Arg3
188 * %esi Arg4
189 * %edi Arg5
190 * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
191 * %esp user stack
192 * 0(%esp) Arg6
193 *
194 * Interrupts off.
195 *
196 * This is purely a fast path. For anything complicated we use the int 0x80
197 * path below. Set up a complete hardware stack frame to share code
198 * with the int 0x80 path.
199 */
200ENTRY(ia32_cstar_target)
201 CFI_STARTPROC32 simple
202 CFI_SIGNAL_FRAME
203 CFI_DEF_CFA rsp,PDA_STACKOFFSET
204 CFI_REGISTER rip,rcx
205 /*CFI_REGISTER rflags,r11*/
206 swapgs
207 movl %esp,%r8d
208 CFI_REGISTER rsp,r8
209 movq %gs:pda_kernelstack,%rsp
210 /*
211 * No need to follow this irqs on/off section: the syscall
212	 * disabled irqs and here we enable them straight after entry:
213 */
214 sti
215 SAVE_ARGS 8,1,1
216 movl %eax,%eax /* zero extension */
217 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
218 movq %rcx,RIP-ARGOFFSET(%rsp)
219 CFI_REL_OFFSET rip,RIP-ARGOFFSET
220 movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
221 movl %ebp,%ecx
222 movq $__USER32_CS,CS-ARGOFFSET(%rsp)
223 movq $__USER32_DS,SS-ARGOFFSET(%rsp)
224 movq %r11,EFLAGS-ARGOFFSET(%rsp)
225 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
226 movq %r8,RSP-ARGOFFSET(%rsp)
227 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
228 /* no need to do an access_ok check here because r8 has been
229 32bit zero extended */
230 /* hardware stack frame is complete now */
2311: movl (%r8),%r9d
232 .section __ex_table,"a"
233 .quad 1b,ia32_badarg
234 .previous
235 GET_THREAD_INFO(%r10)
236 orl $TS_COMPAT,threadinfo_status(%r10)
237 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
238 CFI_REMEMBER_STATE
239 jnz cstar_tracesys
240cstar_do_call:
241 cmpl $IA32_NR_syscalls-1,%eax
242 ja ia32_badsys
243 IA32_ARG_FIXUP 1
244 call *ia32_sys_call_table(,%rax,8)
245 movq %rax,RAX-ARGOFFSET(%rsp)
246 GET_THREAD_INFO(%r10)
247 cli
248 TRACE_IRQS_OFF
249 testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
250 jnz int_ret_from_sys_call
251 andl $~TS_COMPAT,threadinfo_status(%r10)
252 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
253 movl RIP-ARGOFFSET(%rsp),%ecx
254 CFI_REGISTER rip,rcx
255 movl EFLAGS-ARGOFFSET(%rsp),%r11d
256 /*CFI_REGISTER rflags,r11*/
257 TRACE_IRQS_ON
258 movl RSP-ARGOFFSET(%rsp),%esp
259 CFI_RESTORE rsp
260 swapgs
261 sysretl
262
263cstar_tracesys:
264 CFI_RESTORE_STATE
265 SAVE_REST
266 CLEAR_RREGS
267 movq $-ENOSYS,RAX(%rsp) /* really needed? */
268 movq %rsp,%rdi /* &pt_regs -> arg1 */
269 call syscall_trace_enter
270 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
271 RESTORE_REST
272 movl RSP-ARGOFFSET(%rsp), %r8d
273 /* no need to do an access_ok check here because r8 has been
274 32bit zero extended */
2751: movl (%r8),%r9d
276 .section __ex_table,"a"
277 .quad 1b,ia32_badarg
278 .previous
279 jmp cstar_do_call
280END(ia32_cstar_target)
281
282ia32_badarg:
283 movq $-EFAULT,%rax
284 jmp ia32_sysret
285 CFI_ENDPROC
286
287/*
288 * Emulated IA32 system calls via int 0x80.
289 *
290 * Arguments:
291 * %eax System call number.
292 * %ebx Arg1
293 * %ecx Arg2
294 * %edx Arg3
295 * %esi Arg4
296 * %edi Arg5
297 * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
298 *
299 * Notes:
300 * Uses the same stack frame as the x86-64 version.
301 * All registers except %eax must be saved (but ptrace may violate that)
302 * Arguments are zero extended. For system calls that want sign extension and
303 * take long arguments a wrapper is needed. Most calls can just be called
304 * directly.
305 * Assumes it is only called from user space and entered with interrupts off.
306 */
307
308ENTRY(ia32_syscall)
309 CFI_STARTPROC32 simple
310 CFI_SIGNAL_FRAME
311 CFI_DEF_CFA rsp,SS+8-RIP
312 /*CFI_REL_OFFSET ss,SS-RIP*/
313 CFI_REL_OFFSET rsp,RSP-RIP
314 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
315 /*CFI_REL_OFFSET cs,CS-RIP*/
316 CFI_REL_OFFSET rip,RIP-RIP
317 swapgs
318 /*
319 * No need to follow this irqs on/off section: the syscall
320	 * disabled irqs and here we enable them straight after entry:
321 */
322 sti
323 movl %eax,%eax
324 pushq %rax
325 CFI_ADJUST_CFA_OFFSET 8
326 cld
327	/* note the registers are not zero extended to the stack frame;
328	   this could be a problem. */
329 SAVE_ARGS 0,0,1
330 GET_THREAD_INFO(%r10)
331 orl $TS_COMPAT,threadinfo_status(%r10)
332 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
333 jnz ia32_tracesys
334ia32_do_syscall:
335 cmpl $(IA32_NR_syscalls-1),%eax
336 ja ia32_badsys
337 IA32_ARG_FIXUP
338 call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
339ia32_sysret:
340 movq %rax,RAX-ARGOFFSET(%rsp)
341 jmp int_ret_from_sys_call
342
343ia32_tracesys:
344 SAVE_REST
345 CLEAR_RREGS
346 movq $-ENOSYS,RAX(%rsp) /* really needed? */
347 movq %rsp,%rdi /* &pt_regs -> arg1 */
348 call syscall_trace_enter
349 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
350 RESTORE_REST
351 jmp ia32_do_syscall
352END(ia32_syscall)
353
354ia32_badsys:
355 movq $0,ORIG_RAX-ARGOFFSET(%rsp)
356 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
357 jmp int_ret_from_sys_call
358
359quiet_ni_syscall:
360 movq $-ENOSYS,%rax
361 ret
362 CFI_ENDPROC
363
364 .macro PTREGSCALL label, func, arg
365 .globl \label
366\label:
367 leaq \func(%rip),%rax
368 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
369 jmp ia32_ptregs_common
370 .endm
371
372 CFI_STARTPROC32
373
374 PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
375 PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
376 PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
377 PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
378 PTREGSCALL stub32_execve, sys32_execve, %rcx
379 PTREGSCALL stub32_fork, sys_fork, %rdi
380 PTREGSCALL stub32_clone, sys32_clone, %rdx
381 PTREGSCALL stub32_vfork, sys_vfork, %rdi
382 PTREGSCALL stub32_iopl, sys_iopl, %rsi
383 PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
384
385ENTRY(ia32_ptregs_common)
386 popq %r11
387 CFI_ENDPROC
388 CFI_STARTPROC32 simple
389 CFI_SIGNAL_FRAME
390 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
391 CFI_REL_OFFSET rax,RAX-ARGOFFSET
392 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
393 CFI_REL_OFFSET rdx,RDX-ARGOFFSET
394 CFI_REL_OFFSET rsi,RSI-ARGOFFSET
395 CFI_REL_OFFSET rdi,RDI-ARGOFFSET
396 CFI_REL_OFFSET rip,RIP-ARGOFFSET
397/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
398/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
399 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
400/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
401 SAVE_REST
402 call *%rax
403 RESTORE_REST
404 jmp ia32_sysret /* misbalances the return cache */
405 CFI_ENDPROC
406END(ia32_ptregs_common)
407
408 .section .rodata,"a"
409 .align 8
410ia32_sys_call_table:
411 .quad sys_restart_syscall
412 .quad sys_exit
413 .quad stub32_fork
414 .quad sys_read
415 .quad sys_write
416 .quad compat_sys_open /* 5 */
417 .quad sys_close
418 .quad sys32_waitpid
419 .quad sys_creat
420 .quad sys_link
421 .quad sys_unlink /* 10 */
422 .quad stub32_execve
423 .quad sys_chdir
424 .quad compat_sys_time
425 .quad sys_mknod
426 .quad sys_chmod /* 15 */
427 .quad sys_lchown16
428 .quad quiet_ni_syscall /* old break syscall holder */
429 .quad sys_stat
430 .quad sys32_lseek
431 .quad sys_getpid /* 20 */
432 .quad compat_sys_mount /* mount */
433 .quad sys_oldumount /* old_umount */
434 .quad sys_setuid16
435 .quad sys_getuid16
436 .quad compat_sys_stime /* stime */ /* 25 */
437 .quad sys32_ptrace /* ptrace */
438 .quad sys_alarm
439 .quad sys_fstat /* (old)fstat */
440 .quad sys_pause
441 .quad compat_sys_utime /* 30 */
442 .quad quiet_ni_syscall /* old stty syscall holder */
443 .quad quiet_ni_syscall /* old gtty syscall holder */
444 .quad sys_access
445 .quad sys_nice
446 .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
447 .quad sys_sync
448 .quad sys32_kill
449 .quad sys_rename
450 .quad sys_mkdir
451 .quad sys_rmdir /* 40 */
452 .quad sys_dup
453 .quad sys32_pipe
454 .quad compat_sys_times
455 .quad quiet_ni_syscall /* old prof syscall holder */
456 .quad sys_brk /* 45 */
457 .quad sys_setgid16
458 .quad sys_getgid16
459 .quad sys_signal
460 .quad sys_geteuid16
461 .quad sys_getegid16 /* 50 */
462 .quad sys_acct
463 .quad sys_umount /* new_umount */
464 .quad quiet_ni_syscall /* old lock syscall holder */
465 .quad compat_sys_ioctl
466 .quad compat_sys_fcntl64 /* 55 */
467 .quad quiet_ni_syscall /* old mpx syscall holder */
468 .quad sys_setpgid
469 .quad quiet_ni_syscall /* old ulimit syscall holder */
470 .quad sys32_olduname
471 .quad sys_umask /* 60 */
472 .quad sys_chroot
473 .quad sys32_ustat
474 .quad sys_dup2
475 .quad sys_getppid
476 .quad sys_getpgrp /* 65 */
477 .quad sys_setsid
478 .quad sys32_sigaction
479 .quad sys_sgetmask
480 .quad sys_ssetmask
481 .quad sys_setreuid16 /* 70 */
482 .quad sys_setregid16
483 .quad stub32_sigsuspend
484 .quad compat_sys_sigpending
485 .quad sys_sethostname
486 .quad compat_sys_setrlimit /* 75 */
487 .quad compat_sys_old_getrlimit /* old_getrlimit */
488 .quad compat_sys_getrusage
489 .quad sys32_gettimeofday
490 .quad sys32_settimeofday
491 .quad sys_getgroups16 /* 80 */
492 .quad sys_setgroups16
493 .quad sys32_old_select
494 .quad sys_symlink
495 .quad sys_lstat
496 .quad sys_readlink /* 85 */
497 .quad sys_uselib
498 .quad sys_swapon
499 .quad sys_reboot
500 .quad compat_sys_old_readdir
501 .quad sys32_mmap /* 90 */
502 .quad sys_munmap
503 .quad sys_truncate
504 .quad sys_ftruncate
505 .quad sys_fchmod
506 .quad sys_fchown16 /* 95 */
507 .quad sys_getpriority
508 .quad sys_setpriority
509 .quad quiet_ni_syscall /* old profil syscall holder */
510 .quad compat_sys_statfs
511 .quad compat_sys_fstatfs /* 100 */
512 .quad sys_ioperm
513 .quad compat_sys_socketcall
514 .quad sys_syslog
515 .quad compat_sys_setitimer
516 .quad compat_sys_getitimer /* 105 */
517 .quad compat_sys_newstat
518 .quad compat_sys_newlstat
519 .quad compat_sys_newfstat
520 .quad sys32_uname
521 .quad stub32_iopl /* 110 */
522 .quad sys_vhangup
523 .quad quiet_ni_syscall /* old "idle" system call */
524 .quad sys32_vm86_warning /* vm86old */
525 .quad compat_sys_wait4
526 .quad sys_swapoff /* 115 */
527 .quad compat_sys_sysinfo
528 .quad sys32_ipc
529 .quad sys_fsync
530 .quad stub32_sigreturn
531 .quad stub32_clone /* 120 */
532 .quad sys_setdomainname
533 .quad sys_uname
534 .quad sys_modify_ldt
535 .quad compat_sys_adjtimex
536 .quad sys32_mprotect /* 125 */
537 .quad compat_sys_sigprocmask
538 .quad quiet_ni_syscall /* create_module */
539 .quad sys_init_module
540 .quad sys_delete_module
541 .quad quiet_ni_syscall /* 130 get_kernel_syms */
542 .quad sys32_quotactl
543 .quad sys_getpgid
544 .quad sys_fchdir
545 .quad quiet_ni_syscall /* bdflush */
546 .quad sys_sysfs /* 135 */
547 .quad sys_personality
548 .quad quiet_ni_syscall /* for afs_syscall */
549 .quad sys_setfsuid16
550 .quad sys_setfsgid16
551 .quad sys_llseek /* 140 */
552 .quad compat_sys_getdents
553 .quad compat_sys_select
554 .quad sys_flock
555 .quad sys_msync
556 .quad compat_sys_readv /* 145 */
557 .quad compat_sys_writev
558 .quad sys_getsid
559 .quad sys_fdatasync
560 .quad sys32_sysctl /* sysctl */
561 .quad sys_mlock /* 150 */
562 .quad sys_munlock
563 .quad sys_mlockall
564 .quad sys_munlockall
565 .quad sys_sched_setparam
566 .quad sys_sched_getparam /* 155 */
567 .quad sys_sched_setscheduler
568 .quad sys_sched_getscheduler
569 .quad sys_sched_yield
570 .quad sys_sched_get_priority_max
571 .quad sys_sched_get_priority_min /* 160 */
572 .quad sys32_sched_rr_get_interval
573 .quad compat_sys_nanosleep
574 .quad sys_mremap
575 .quad sys_setresuid16
576 .quad sys_getresuid16 /* 165 */
577 .quad sys32_vm86_warning /* vm86 */
578 .quad quiet_ni_syscall /* query_module */
579 .quad sys_poll
580 .quad compat_sys_nfsservctl
581 .quad sys_setresgid16 /* 170 */
582 .quad sys_getresgid16
583 .quad sys_prctl
584 .quad stub32_rt_sigreturn
585 .quad sys32_rt_sigaction
586 .quad sys32_rt_sigprocmask /* 175 */
587 .quad sys32_rt_sigpending
588 .quad compat_sys_rt_sigtimedwait
589 .quad sys32_rt_sigqueueinfo
590 .quad stub32_rt_sigsuspend
591 .quad sys32_pread /* 180 */
592 .quad sys32_pwrite
593 .quad sys_chown16
594 .quad sys_getcwd
595 .quad sys_capget
596 .quad sys_capset
597 .quad stub32_sigaltstack
598 .quad sys32_sendfile
599 .quad quiet_ni_syscall /* streams1 */
600 .quad quiet_ni_syscall /* streams2 */
601 .quad stub32_vfork /* 190 */
602 .quad compat_sys_getrlimit
603 .quad sys32_mmap2
604 .quad sys32_truncate64
605 .quad sys32_ftruncate64
606 .quad sys32_stat64 /* 195 */
607 .quad sys32_lstat64
608 .quad sys32_fstat64
609 .quad sys_lchown
610 .quad sys_getuid
611 .quad sys_getgid /* 200 */
612 .quad sys_geteuid
613 .quad sys_getegid
614 .quad sys_setreuid
615 .quad sys_setregid
616 .quad sys_getgroups /* 205 */
617 .quad sys_setgroups
618 .quad sys_fchown
619 .quad sys_setresuid
620 .quad sys_getresuid
621 .quad sys_setresgid /* 210 */
622 .quad sys_getresgid
623 .quad sys_chown
624 .quad sys_setuid
625 .quad sys_setgid
626 .quad sys_setfsuid /* 215 */
627 .quad sys_setfsgid
628 .quad sys_pivot_root
629 .quad sys_mincore
630 .quad sys_madvise
631 .quad compat_sys_getdents64 /* 220 getdents64 */
632 .quad compat_sys_fcntl64
633 .quad quiet_ni_syscall /* tux */
634 .quad quiet_ni_syscall /* security */
635 .quad sys_gettid
636 .quad sys32_readahead /* 225 */
637 .quad sys_setxattr
638 .quad sys_lsetxattr
639 .quad sys_fsetxattr
640 .quad sys_getxattr
641 .quad sys_lgetxattr /* 230 */
642 .quad sys_fgetxattr
643 .quad sys_listxattr
644 .quad sys_llistxattr
645 .quad sys_flistxattr
646 .quad sys_removexattr /* 235 */
647 .quad sys_lremovexattr
648 .quad sys_fremovexattr
649 .quad sys_tkill
650 .quad sys_sendfile64
651 .quad compat_sys_futex /* 240 */
652 .quad compat_sys_sched_setaffinity
653 .quad compat_sys_sched_getaffinity
654 .quad sys32_set_thread_area
655 .quad sys32_get_thread_area
656 .quad compat_sys_io_setup /* 245 */
657 .quad sys_io_destroy
658 .quad compat_sys_io_getevents
659 .quad compat_sys_io_submit
660 .quad sys_io_cancel
661 .quad sys32_fadvise64 /* 250 */
662 .quad quiet_ni_syscall /* free_huge_pages */
663 .quad sys_exit_group
664 .quad sys32_lookup_dcookie
665 .quad sys_epoll_create
666 .quad sys_epoll_ctl /* 255 */
667 .quad sys_epoll_wait
668 .quad sys_remap_file_pages
669 .quad sys_set_tid_address
670 .quad compat_sys_timer_create
671 .quad compat_sys_timer_settime /* 260 */
672 .quad compat_sys_timer_gettime
673 .quad sys_timer_getoverrun
674 .quad sys_timer_delete
675 .quad compat_sys_clock_settime
676 .quad compat_sys_clock_gettime /* 265 */
677 .quad compat_sys_clock_getres
678 .quad compat_sys_clock_nanosleep
679 .quad compat_sys_statfs64
680 .quad compat_sys_fstatfs64
681 .quad sys_tgkill /* 270 */
682 .quad compat_sys_utimes
683 .quad sys32_fadvise64_64
684 .quad quiet_ni_syscall /* sys_vserver */
685 .quad sys_mbind
686 .quad compat_sys_get_mempolicy /* 275 */
687 .quad sys_set_mempolicy
688 .quad compat_sys_mq_open
689 .quad sys_mq_unlink
690 .quad compat_sys_mq_timedsend
691 .quad compat_sys_mq_timedreceive /* 280 */
692 .quad compat_sys_mq_notify
693 .quad compat_sys_mq_getsetattr
694 .quad compat_sys_kexec_load /* reserved for kexec */
695 .quad compat_sys_waitid
696 .quad quiet_ni_syscall /* 285: sys_altroot */
697 .quad sys_add_key
698 .quad sys_request_key
699 .quad sys_keyctl
700 .quad sys_ioprio_set
701 .quad sys_ioprio_get /* 290 */
702 .quad sys_inotify_init
703 .quad sys_inotify_add_watch
704 .quad sys_inotify_rm_watch
705 .quad sys_migrate_pages
706 .quad compat_sys_openat /* 295 */
707 .quad sys_mkdirat
708 .quad sys_mknodat
709 .quad sys_fchownat
710 .quad compat_sys_futimesat
711 .quad sys32_fstatat /* 300 */
712 .quad sys_unlinkat
713 .quad sys_renameat
714 .quad sys_linkat
715 .quad sys_symlinkat
716 .quad sys_readlinkat /* 305 */
717 .quad sys_fchmodat
718 .quad sys_faccessat
719 .quad compat_sys_pselect6
720 .quad compat_sys_ppoll
721 .quad sys_unshare /* 310 */
722 .quad compat_sys_set_robust_list
723 .quad compat_sys_get_robust_list
724 .quad sys_splice
725 .quad sys32_sync_file_range
726 .quad sys_tee /* 315 */
727 .quad compat_sys_vmsplice
728 .quad compat_sys_move_pages
729 .quad sys_getcpu
730 .quad sys_epoll_pwait
731 .quad compat_sys_utimensat /* 320 */
732 .quad compat_sys_signalfd
733 .quad compat_sys_timerfd
734 .quad sys_eventfd
735 .quad sys32_fallocate
736ia32_syscall_end:
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
new file mode 100644
index 000000000000..2e1869ec4db4
--- /dev/null
+++ b/arch/x86/ia32/ipc32.c
@@ -0,0 +1,57 @@
1#include <linux/kernel.h>
2#include <linux/spinlock.h>
3#include <linux/list.h>
4#include <linux/syscalls.h>
5#include <linux/time.h>
6#include <linux/sem.h>
7#include <linux/msg.h>
8#include <linux/shm.h>
9#include <linux/ipc.h>
10#include <linux/compat.h>
11
12#include <asm/ipc.h>
13
14asmlinkage long
15sys32_ipc(u32 call, int first, int second, int third,
16 compat_uptr_t ptr, u32 fifth)
17{
18 int version;
19
20 version = call >> 16; /* hack for backward compatibility */
21 call &= 0xffff;
22
23 switch (call) {
24 case SEMOP:
25 /* struct sembuf is the same on 32 and 64bit :)) */
26 return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
27 case SEMTIMEDOP:
28 return compat_sys_semtimedop(first, compat_ptr(ptr), second,
29 compat_ptr(fifth));
30 case SEMGET:
31 return sys_semget(first, second, third);
32 case SEMCTL:
33 return compat_sys_semctl(first, second, third, compat_ptr(ptr));
34
35 case MSGSND:
36 return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
37 case MSGRCV:
38 return compat_sys_msgrcv(first, second, fifth, third,
39 version, compat_ptr(ptr));
40 case MSGGET:
41 return sys_msgget((key_t) first, second);
42 case MSGCTL:
43 return compat_sys_msgctl(first, second, compat_ptr(ptr));
44
45 case SHMAT:
46 return compat_sys_shmat(first, second, third, version,
47 compat_ptr(ptr));
49 case SHMDT:
50 return sys_shmdt(compat_ptr(ptr));
51 case SHMGET:
52 return sys_shmget(first, (unsigned)second, third);
53 case SHMCTL:
54 return compat_sys_shmctl(first, second, compat_ptr(ptr));
55 }
56 return -ENOSYS;
57}
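sys32_ipc() above demultiplexes the single i386 ipc(2) system call: the operation is in the low 16 bits of the first argument and the IPC version in the high 16 bits. A hedged 32-bit userspace sketch driving the SEMGET path (SYS_ipc exists only in the 32-bit ABI; the SEMGET value of 2 mirrors include/linux/ipc.h):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SEMGET
#define SEMGET 2	/* matches include/linux/ipc.h */
#endif

int main(void)
{
	/* ipc(SEMGET, key, nsems, semflg, NULL): create a private
	 * semaphore set with one semaphore. */
	long id = syscall(SYS_ipc, SEMGET, IPC_PRIVATE, 1, 0600 | IPC_CREAT, NULL);

	if (id < 0) {
		perror("ipc(SEMGET)");
		return 1;
	}
	printf("semaphore set id = %ld\n", id);
	return 0;
}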
diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c
new file mode 100644
index 000000000000..e4b84b4a417a
--- /dev/null
+++ b/arch/x86/ia32/mmap32.c
@@ -0,0 +1,79 @@
1/*
2 * linux/arch/x86_64/ia32/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Based on the i386 version which was
7 *
8 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
9 * All Rights Reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 *
26 * Started by Ingo Molnar <mingo@elte.hu>
27 */
28
29#include <linux/personality.h>
30#include <linux/mm.h>
31#include <linux/random.h>
32#include <linux/sched.h>
33
34/*
35 * Top of mmap area (just below the process stack).
36 *
 37 * Leave at least a ~128 MB hole.
38 */
39#define MIN_GAP (128*1024*1024)
40#define MAX_GAP (TASK_SIZE/6*5)
41
42static inline unsigned long mmap_base(struct mm_struct *mm)
43{
44 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
45 unsigned long random_factor = 0;
46
47 if (current->flags & PF_RANDOMIZE)
48 random_factor = get_random_int() % (1024*1024);
49
50 if (gap < MIN_GAP)
51 gap = MIN_GAP;
52 else if (gap > MAX_GAP)
53 gap = MAX_GAP;
54
55 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
56}
57
58/*
59 * This function, called very early during the creation of a new
60 * process VM image, sets up which VM layout function to use:
61 */
62void ia32_pick_mmap_layout(struct mm_struct *mm)
63{
64 /*
65 * Fall back to the standard layout if the personality
66 * bit is set, or if the expected stack growth is unlimited:
67 */
68 if (sysctl_legacy_va_layout ||
69 (current->personality & ADDR_COMPAT_LAYOUT) ||
70 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
71 mm->mmap_base = TASK_UNMAPPED_BASE;
72 mm->get_unmapped_area = arch_get_unmapped_area;
73 mm->unmap_area = arch_unmap_area;
74 } else {
75 mm->mmap_base = mmap_base(mm);
76 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
77 mm->unmap_area = arch_unmap_area_topdown;
78 }
79}
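A worked example of mmap_base() above, assuming for illustration a 3 GB compat TASK_SIZE of 0xc0000000, an 8 MB RLIMIT_STACK and randomization disabled: the 8 MB gap is clamped up to MIN_GAP (128 MB), so the flexible mmap area tops out at 0xb8000000. The same arithmetic as a standalone sketch:

#include <stdio.h>

#define MIN_GAP		(128UL * 1024 * 1024)
#define TASK_SIZE_3GB	0xc0000000UL		/* illustrative compat TASK_SIZE */
#define PAGE_ALIGN(x)	(((x) + 4095UL) & ~4095UL)

int main(void)
{
	unsigned long gap = 8UL * 1024 * 1024;	/* typical 8 MB stack rlimit */
	unsigned long max_gap = TASK_SIZE_3GB / 6 * 5;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > max_gap)
		gap = max_gap;

	/* Prints 0xb8000000: top of the flexible mmap area for this setup. */
	printf("mmap_base = %#lx\n", PAGE_ALIGN(TASK_SIZE_3GB - gap - 0));
	return 0;
}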
diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
new file mode 100644
index 000000000000..4a233ad6269c
--- /dev/null
+++ b/arch/x86/ia32/ptrace32.c
@@ -0,0 +1,404 @@
1/*
2 * 32bit ptrace for x86-64.
3 *
4 * Copyright 2001,2002 Andi Kleen, SuSE Labs.
5 * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
6 * copyright.
7 *
  8 * This allows access to 64bit processes too; but there is no way to see the extended
9 * register contents.
10 */
11
12#include <linux/kernel.h>
13#include <linux/stddef.h>
14#include <linux/sched.h>
15#include <linux/syscalls.h>
16#include <linux/unistd.h>
17#include <linux/mm.h>
18#include <linux/err.h>
19#include <linux/ptrace.h>
20#include <asm/ptrace.h>
21#include <asm/compat.h>
22#include <asm/uaccess.h>
23#include <asm/user32.h>
24#include <asm/user.h>
25#include <asm/errno.h>
26#include <asm/debugreg.h>
27#include <asm/i387.h>
28#include <asm/fpu32.h>
29#include <asm/ia32.h>
30
31/*
32 * Determines which flags the user has access to [1 = access, 0 = no access].
33 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
34 * Also masks reserved bits (31-22, 15, 5, 3, 1).
35 */
36#define FLAG_MASK 0x54dd5UL
37
38#define R32(l,q) \
39 case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
40
41static int putreg32(struct task_struct *child, unsigned regno, u32 val)
42{
43 int i;
44 __u64 *stack = (__u64 *)task_pt_regs(child);
45
46 switch (regno) {
47 case offsetof(struct user32, regs.fs):
48 if (val && (val & 3) != 3) return -EIO;
49 child->thread.fsindex = val & 0xffff;
50 break;
51 case offsetof(struct user32, regs.gs):
52 if (val && (val & 3) != 3) return -EIO;
53 child->thread.gsindex = val & 0xffff;
54 break;
55 case offsetof(struct user32, regs.ds):
56 if (val && (val & 3) != 3) return -EIO;
57 child->thread.ds = val & 0xffff;
58 break;
59 case offsetof(struct user32, regs.es):
60 child->thread.es = val & 0xffff;
61 break;
62 case offsetof(struct user32, regs.ss):
63 if ((val & 3) != 3) return -EIO;
64 stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
65 break;
66 case offsetof(struct user32, regs.cs):
67 if ((val & 3) != 3) return -EIO;
68 stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
69 break;
70
71 R32(ebx, rbx);
72 R32(ecx, rcx);
73 R32(edx, rdx);
74 R32(edi, rdi);
75 R32(esi, rsi);
76 R32(ebp, rbp);
77 R32(eax, rax);
78 R32(orig_eax, orig_rax);
79 R32(eip, rip);
80 R32(esp, rsp);
81
82 case offsetof(struct user32, regs.eflags): {
83 __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
84 val &= FLAG_MASK;
85 *flags = val | (*flags & ~FLAG_MASK);
86 break;
87 }
88
89 case offsetof(struct user32, u_debugreg[4]):
90 case offsetof(struct user32, u_debugreg[5]):
91 return -EIO;
92
93 case offsetof(struct user32, u_debugreg[0]):
94 child->thread.debugreg0 = val;
95 break;
96
97 case offsetof(struct user32, u_debugreg[1]):
98 child->thread.debugreg1 = val;
99 break;
100
101 case offsetof(struct user32, u_debugreg[2]):
102 child->thread.debugreg2 = val;
103 break;
104
105 case offsetof(struct user32, u_debugreg[3]):
106 child->thread.debugreg3 = val;
107 break;
108
109 case offsetof(struct user32, u_debugreg[6]):
110 child->thread.debugreg6 = val;
111 break;
112
113 case offsetof(struct user32, u_debugreg[7]):
114 val &= ~DR_CONTROL_RESERVED;
115 /* See arch/i386/kernel/ptrace.c for an explanation of
116 * this awkward check.*/
117 for(i=0; i<4; i++)
118 if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
119 return -EIO;
120 child->thread.debugreg7 = val;
121 if (val)
122 set_tsk_thread_flag(child, TIF_DEBUG);
123 else
124 clear_tsk_thread_flag(child, TIF_DEBUG);
125 break;
126
127 default:
128 if (regno > sizeof(struct user32) || (regno & 3))
129 return -EIO;
130
131 /* Other dummy fields in the virtual user structure are ignored */
132 break;
133 }
134 return 0;
135}
136
137#undef R32
138
139#define R32(l,q) \
140 case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
141
142static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
143{
144 __u64 *stack = (__u64 *)task_pt_regs(child);
145
146 switch (regno) {
147 case offsetof(struct user32, regs.fs):
148 *val = child->thread.fsindex;
149 break;
150 case offsetof(struct user32, regs.gs):
151 *val = child->thread.gsindex;
152 break;
153 case offsetof(struct user32, regs.ds):
154 *val = child->thread.ds;
155 break;
156 case offsetof(struct user32, regs.es):
157 *val = child->thread.es;
158 break;
159
160 R32(cs, cs);
161 R32(ss, ss);
162 R32(ebx, rbx);
163 R32(ecx, rcx);
164 R32(edx, rdx);
165 R32(edi, rdi);
166 R32(esi, rsi);
167 R32(ebp, rbp);
168 R32(eax, rax);
169 R32(orig_eax, orig_rax);
170 R32(eip, rip);
171 R32(eflags, eflags);
172 R32(esp, rsp);
173
174 case offsetof(struct user32, u_debugreg[0]):
175 *val = child->thread.debugreg0;
176 break;
177 case offsetof(struct user32, u_debugreg[1]):
178 *val = child->thread.debugreg1;
179 break;
180 case offsetof(struct user32, u_debugreg[2]):
181 *val = child->thread.debugreg2;
182 break;
183 case offsetof(struct user32, u_debugreg[3]):
184 *val = child->thread.debugreg3;
185 break;
186 case offsetof(struct user32, u_debugreg[6]):
187 *val = child->thread.debugreg6;
188 break;
189 case offsetof(struct user32, u_debugreg[7]):
190 *val = child->thread.debugreg7;
191 break;
192
193 default:
194 if (regno > sizeof(struct user32) || (regno & 3))
195 return -EIO;
196
197 /* Other dummy fields in the virtual user structure are ignored */
198 *val = 0;
199 break;
200 }
201 return 0;
202}
203
204#undef R32
205
206static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
207{
208 int ret;
209 compat_siginfo_t __user *si32 = compat_ptr(data);
210 siginfo_t ssi;
211 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
212 if (request == PTRACE_SETSIGINFO) {
213 memset(&ssi, 0, sizeof(siginfo_t));
214 ret = copy_siginfo_from_user32(&ssi, si32);
215 if (ret)
216 return ret;
217 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
218 return -EFAULT;
219 }
220 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
221 if (ret)
222 return ret;
223 if (request == PTRACE_GETSIGINFO) {
224 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
225 return -EFAULT;
226 ret = copy_siginfo_to_user32(si32, &ssi);
227 }
228 return ret;
229}
230
231asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
232{
233 struct task_struct *child;
234 struct pt_regs *childregs;
235 void __user *datap = compat_ptr(data);
236 int ret;
237 __u32 val;
238
239 switch (request) {
240 case PTRACE_TRACEME:
241 case PTRACE_ATTACH:
242 case PTRACE_KILL:
243 case PTRACE_CONT:
244 case PTRACE_SINGLESTEP:
245 case PTRACE_DETACH:
246 case PTRACE_SYSCALL:
247 case PTRACE_OLDSETOPTIONS:
248 case PTRACE_SETOPTIONS:
249 case PTRACE_SET_THREAD_AREA:
250 case PTRACE_GET_THREAD_AREA:
251 return sys_ptrace(request, pid, addr, data);
252
253 default:
254 return -EINVAL;
255
256 case PTRACE_PEEKTEXT:
257 case PTRACE_PEEKDATA:
258 case PTRACE_POKEDATA:
259 case PTRACE_POKETEXT:
260 case PTRACE_POKEUSR:
261 case PTRACE_PEEKUSR:
262 case PTRACE_GETREGS:
263 case PTRACE_SETREGS:
264 case PTRACE_SETFPREGS:
265 case PTRACE_GETFPREGS:
266 case PTRACE_SETFPXREGS:
267 case PTRACE_GETFPXREGS:
268 case PTRACE_GETEVENTMSG:
269 break;
270
271 case PTRACE_SETSIGINFO:
272 case PTRACE_GETSIGINFO:
273 return ptrace32_siginfo(request, pid, addr, data);
274 }
275
276 child = ptrace_get_task_struct(pid);
277 if (IS_ERR(child))
278 return PTR_ERR(child);
279
280 ret = ptrace_check_attach(child, request == PTRACE_KILL);
281 if (ret < 0)
282 goto out;
283
284 childregs = task_pt_regs(child);
285
286 switch (request) {
287 case PTRACE_PEEKDATA:
288 case PTRACE_PEEKTEXT:
289 ret = 0;
290 if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
291 ret = -EIO;
292 else
293 ret = put_user(val, (unsigned int __user *)datap);
294 break;
295
296 case PTRACE_POKEDATA:
297 case PTRACE_POKETEXT:
298 ret = 0;
299 if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
300 ret = -EIO;
301 break;
302
303 case PTRACE_PEEKUSR:
304 ret = getreg32(child, addr, &val);
305 if (ret == 0)
306 ret = put_user(val, (__u32 __user *)datap);
307 break;
308
309 case PTRACE_POKEUSR:
310 ret = putreg32(child, addr, data);
311 break;
312
313 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
314 int i;
315 if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
316 ret = -EIO;
317 break;
318 }
319 ret = 0;
320 for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
321 getreg32(child, i, &val);
322 ret |= __put_user(val,(u32 __user *)datap);
323 datap += sizeof(u32);
324 }
325 break;
326 }
327
328 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
329 unsigned long tmp;
330 int i;
331 if (!access_ok(VERIFY_READ, datap, 16*4)) {
332 ret = -EIO;
333 break;
334 }
335 ret = 0;
336 for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
337 ret |= __get_user(tmp, (u32 __user *)datap);
338 putreg32(child, i, tmp);
339 datap += sizeof(u32);
340 }
341 break;
342 }
343
344 case PTRACE_GETFPREGS:
345 ret = -EIO;
346 if (!access_ok(VERIFY_READ, compat_ptr(data),
347 sizeof(struct user_i387_struct)))
348 break;
349 save_i387_ia32(child, datap, childregs, 1);
350 ret = 0;
351 break;
352
353 case PTRACE_SETFPREGS:
354 ret = -EIO;
355 if (!access_ok(VERIFY_WRITE, datap,
356 sizeof(struct user_i387_struct)))
357 break;
358 ret = 0;
359 /* don't check EFAULT to be bug-to-bug compatible to i386 */
360 restore_i387_ia32(child, datap, 1);
361 break;
362
363 case PTRACE_GETFPXREGS: {
364 struct user32_fxsr_struct __user *u = datap;
365 init_fpu(child);
366 ret = -EIO;
367 if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
368 break;
369 ret = -EFAULT;
370 if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
371 break;
372 ret = __put_user(childregs->cs, &u->fcs);
373 ret |= __put_user(child->thread.ds, &u->fos);
374 break;
375 }
376 case PTRACE_SETFPXREGS: {
377 struct user32_fxsr_struct __user *u = datap;
378 unlazy_fpu(child);
379 ret = -EIO;
380 if (!access_ok(VERIFY_READ, u, sizeof(*u)))
381 break;
382 /* no checking to be bug-to-bug compatible with i386. */
383 /* but silence warning */
384 if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
385 ;
386 set_stopped_child_used_math(child);
387 child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
388 ret = 0;
389 break;
390 }
391
392 case PTRACE_GETEVENTMSG:
393 ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
394 break;
395
396 default:
397 BUG();
398 }
399
400 out:
401 put_task_struct(child);
402 return ret;
403}
404
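For context, a minimal 32-bit userspace sketch of the request path these handlers serve; this is illustrative only (it assumes an already-attached, stopped child and must be built as a 32-bit binary so the user_regs_struct offsets line up with struct user32):

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* A PTRACE_PEEKUSER request from a 32-bit tracer ends up in getreg32() above. */
long read_child_eip(pid_t pid)
{
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user_regs_struct, eip), NULL);
}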
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
new file mode 100644
index 000000000000..bee96d614432
--- /dev/null
+++ b/arch/x86/ia32/sys_ia32.c
@@ -0,0 +1,889 @@
1/*
2 * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
3 * sys_sparc32
4 *
5 * Copyright (C) 2000 VA Linux Co
6 * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
7 * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
8 * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
9 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
10 * Copyright (C) 2000 Hewlett-Packard Co.
11 * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
12 * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
13 *
14 * These routines maintain argument size conversion between 32bit and 64bit
15 * environment. In 2.5 most of this should be moved to a generic directory.
16 *
17 * This file assumes that there is a hole at the end of user address space.
18 *
19 * Some of the functions are LE specific currently. These are hopefully all marked.
20 * This should be fixed.
21 */
22
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/fs.h>
26#include <linux/file.h>
27#include <linux/signal.h>
28#include <linux/syscalls.h>
29#include <linux/resource.h>
30#include <linux/times.h>
31#include <linux/utsname.h>
32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/sem.h>
35#include <linux/msg.h>
36#include <linux/mm.h>
37#include <linux/shm.h>
38#include <linux/slab.h>
39#include <linux/uio.h>
40#include <linux/nfs_fs.h>
41#include <linux/quota.h>
42#include <linux/module.h>
43#include <linux/sunrpc/svc.h>
44#include <linux/nfsd/nfsd.h>
45#include <linux/nfsd/cache.h>
46#include <linux/nfsd/xdr.h>
47#include <linux/nfsd/syscall.h>
48#include <linux/poll.h>
49#include <linux/personality.h>
50#include <linux/stat.h>
51#include <linux/ipc.h>
52#include <linux/rwsem.h>
53#include <linux/binfmts.h>
54#include <linux/init.h>
55#include <linux/aio_abi.h>
56#include <linux/aio.h>
57#include <linux/compat.h>
58#include <linux/vfs.h>
59#include <linux/ptrace.h>
60#include <linux/highuid.h>
61#include <linux/vmalloc.h>
62#include <linux/fsnotify.h>
63#include <linux/sysctl.h>
64#include <asm/mman.h>
65#include <asm/types.h>
66#include <asm/uaccess.h>
67#include <asm/semaphore.h>
68#include <asm/atomic.h>
69#include <asm/ldt.h>
70
71#include <net/scm.h>
72#include <net/sock.h>
73#include <asm/ia32.h>
74
75#define AA(__x) ((unsigned long)(__x))
76
77int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
78{
79 compat_ino_t ino;
80
81 typeof(ubuf->st_uid) uid = 0;
82 typeof(ubuf->st_gid) gid = 0;
83 SET_UID(uid, kbuf->uid);
84 SET_GID(gid, kbuf->gid);
85 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
86 return -EOVERFLOW;
87 if (kbuf->size >= 0x7fffffff)
88 return -EOVERFLOW;
89 ino = kbuf->ino;
90 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
91 return -EOVERFLOW;
92 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
93 __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
94 __put_user (ino, &ubuf->st_ino) ||
95 __put_user (kbuf->mode, &ubuf->st_mode) ||
96 __put_user (kbuf->nlink, &ubuf->st_nlink) ||
97 __put_user (uid, &ubuf->st_uid) ||
98 __put_user (gid, &ubuf->st_gid) ||
99 __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
100 __put_user (kbuf->size, &ubuf->st_size) ||
101 __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
102 __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
103 __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
104 __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
105 __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
106 __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
107 __put_user (kbuf->blksize, &ubuf->st_blksize) ||
108 __put_user (kbuf->blocks, &ubuf->st_blocks))
109 return -EFAULT;
110 return 0;
111}
112
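The size check in cp_compat_stat() is what a non-LFS 32-bit program observes as EOVERFLOW; a hedged userspace sketch of that failure mode (path and message are invented):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

/* A 32-bit binary using the old stat ABI cannot describe files >= 2 GB. */
int stat_or_complain(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0 && errno == EOVERFLOW) {
		/* retry via stat64()/LFS, e.g. build with -D_FILE_OFFSET_BITS=64 */
		fprintf(stderr, "%s: size does not fit in 32 bits\n", path);
		return -1;
	}
	return 0;
}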
113asmlinkage long
114sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
115{
116 return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
117}
118
119asmlinkage long
120sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
121{
122 return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
123}
124
125/* Another set for IA32/LFS -- x86_64 struct stat is different due to
126 support for 64bit inode numbers. */
127
128static int
129cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
130{
131 typeof(ubuf->st_uid) uid = 0;
132 typeof(ubuf->st_gid) gid = 0;
133 SET_UID(uid, stat->uid);
134 SET_GID(gid, stat->gid);
135 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
136 __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
137 __put_user (stat->ino, &ubuf->__st_ino) ||
138 __put_user (stat->ino, &ubuf->st_ino) ||
139 __put_user (stat->mode, &ubuf->st_mode) ||
140 __put_user (stat->nlink, &ubuf->st_nlink) ||
141 __put_user (uid, &ubuf->st_uid) ||
142 __put_user (gid, &ubuf->st_gid) ||
143 __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
144 __put_user (stat->size, &ubuf->st_size) ||
145 __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
146 __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
147 __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
148 __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
149 __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
150 __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
151 __put_user (stat->blksize, &ubuf->st_blksize) ||
152 __put_user (stat->blocks, &ubuf->st_blocks))
153 return -EFAULT;
154 return 0;
155}
156
157asmlinkage long
158sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
159{
160 struct kstat stat;
161 int ret = vfs_stat(filename, &stat);
162 if (!ret)
163 ret = cp_stat64(statbuf, &stat);
164 return ret;
165}
166
167asmlinkage long
168sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
169{
170 struct kstat stat;
171 int ret = vfs_lstat(filename, &stat);
172 if (!ret)
173 ret = cp_stat64(statbuf, &stat);
174 return ret;
175}
176
177asmlinkage long
178sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
179{
180 struct kstat stat;
181 int ret = vfs_fstat(fd, &stat);
182 if (!ret)
183 ret = cp_stat64(statbuf, &stat);
184 return ret;
185}
186
187asmlinkage long
188sys32_fstatat(unsigned int dfd, char __user *filename,
189 struct stat64 __user* statbuf, int flag)
190{
191 struct kstat stat;
192 int error = -EINVAL;
193
194 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
195 goto out;
196
197 if (flag & AT_SYMLINK_NOFOLLOW)
198 error = vfs_lstat_fd(dfd, filename, &stat);
199 else
200 error = vfs_stat_fd(dfd, filename, &stat);
201
202 if (!error)
203 error = cp_stat64(statbuf, &stat);
204
205out:
206 return error;
207}
208
209/*
210 * Linux/i386 originally could not handle more than 4 system call
211 * parameters, so these system calls used a memory block for
212 * parameter passing.
213 */
214
215struct mmap_arg_struct {
216 unsigned int addr;
217 unsigned int len;
218 unsigned int prot;
219 unsigned int flags;
220 unsigned int fd;
221 unsigned int offset;
222};
223
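Caller's side of this packed-argument convention, as a hedged sketch (syscall number 90 is the historical i386 old_mmap slot; the values are made up and the program must be a 32-bit binary):

#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

struct mmap_arg_struct32 {		/* mirrors struct mmap_arg_struct above */
	unsigned int addr, len, prot, flags, fd, offset;
};

void *old_mmap_anon_page(void)
{
	struct mmap_arg_struct32 a = {
		.len	= 4096,
		.prot	= PROT_READ | PROT_WRITE,
		.flags	= MAP_PRIVATE | MAP_ANONYMOUS,
		.fd	= -1,
	};

	/* one pointer argument instead of six scalars */
	return (void *)syscall(90 /* i386 __NR_mmap (old_mmap) */, &a);
}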
224asmlinkage long
225sys32_mmap(struct mmap_arg_struct __user *arg)
226{
227 struct mmap_arg_struct a;
228 struct file *file = NULL;
229 unsigned long retval;
230 struct mm_struct *mm;
231
232 if (copy_from_user(&a, arg, sizeof(a)))
233 return -EFAULT;
234
235 if (a.offset & ~PAGE_MASK)
236 return -EINVAL;
237
238 if (!(a.flags & MAP_ANONYMOUS)) {
239 file = fget(a.fd);
240 if (!file)
241 return -EBADF;
242 }
243
244 mm = current->mm;
245 down_write(&mm->mmap_sem);
246 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
247 if (file)
248 fput(file);
249
250 up_write(&mm->mmap_sem);
251
252 return retval;
253}
254
255asmlinkage long
256sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
257{
258 return sys_mprotect(start,len,prot);
259}
260
261asmlinkage long
262sys32_pipe(int __user *fd)
263{
264 int retval;
265 int fds[2];
266
267 retval = do_pipe(fds);
268 if (retval)
269 goto out;
270 if (copy_to_user(fd, fds, sizeof(fds)))
271 retval = -EFAULT;
272 out:
273 return retval;
274}
275
276asmlinkage long
277sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
278 struct sigaction32 __user *oact, unsigned int sigsetsize)
279{
280 struct k_sigaction new_ka, old_ka;
281 int ret;
282 compat_sigset_t set32;
283
284 /* XXX: Don't preclude handling different sized sigset_t's. */
285 if (sigsetsize != sizeof(compat_sigset_t))
286 return -EINVAL;
287
288 if (act) {
289 compat_uptr_t handler, restorer;
290
291 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
292 __get_user(handler, &act->sa_handler) ||
293 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
294 __get_user(restorer, &act->sa_restorer)||
295 __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
296 return -EFAULT;
297 new_ka.sa.sa_handler = compat_ptr(handler);
298 new_ka.sa.sa_restorer = compat_ptr(restorer);
299 /* FIXME: here we rely on _COMPAT_NSIG_WORDS to be >= _NSIG_WORDS << 1 */
300 switch (_NSIG_WORDS) {
301 case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
302 | (((long)set32.sig[7]) << 32);
303 case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
304 | (((long)set32.sig[5]) << 32);
305 case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
306 | (((long)set32.sig[3]) << 32);
307 case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
308 | (((long)set32.sig[1]) << 32);
309 }
310 }
311
312 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
313
314 if (!ret && oact) {
315 /* FIXME: here we rely on _COMPAT_NSIG_WORDS to be >= _NSIG_WORDS << 1 */
316 switch (_NSIG_WORDS) {
317 case 4:
318 set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
319 set32.sig[6] = old_ka.sa.sa_mask.sig[3];
320 case 3:
321 set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
322 set32.sig[4] = old_ka.sa.sa_mask.sig[2];
323 case 2:
324 set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
325 set32.sig[2] = old_ka.sa.sa_mask.sig[1];
326 case 1:
327 set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
328 set32.sig[0] = old_ka.sa.sa_mask.sig[0];
329 }
330 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
331 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
332 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
333 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
334 __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
335 return -EFAULT;
336 }
337
338 return ret;
339}
340
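The fall-through switches above repack signal masks between the two ABIs; the same pattern recurs in sys32_rt_sigprocmask() and sys32_rt_sigpending() below. A minimal sketch of both directions for the x86-64 case, where _NSIG_WORDS is 1 and the compat set carries two 32-bit words (helper names are invented):

static inline void sigset_from_compat32(sigset_t *dst, const compat_sigset_t *src)
{
	/* one 64-bit word assembled from two 32-bit compat words */
	dst->sig[0] = src->sig[0] | ((unsigned long)src->sig[1] << 32);
}

static inline void sigset_to_compat32(compat_sigset_t *dst, const sigset_t *src)
{
	dst->sig[0] = (u32)src->sig[0];
	dst->sig[1] = (u32)(src->sig[0] >> 32);
}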
341asmlinkage long
342sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
343{
344 struct k_sigaction new_ka, old_ka;
345 int ret;
346
347 if (act) {
348 compat_old_sigset_t mask;
349 compat_uptr_t handler, restorer;
350
351 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
352 __get_user(handler, &act->sa_handler) ||
353 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
354 __get_user(restorer, &act->sa_restorer) ||
355 __get_user(mask, &act->sa_mask))
356 return -EFAULT;
357
358 new_ka.sa.sa_handler = compat_ptr(handler);
359 new_ka.sa.sa_restorer = compat_ptr(restorer);
360
361 siginitset(&new_ka.sa.sa_mask, mask);
362 }
363
364 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
365
366 if (!ret && oact) {
367 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
368 __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
369 __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
370 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
371 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
372 return -EFAULT;
373 }
374
375 return ret;
376}
377
378asmlinkage long
379sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
380 compat_sigset_t __user *oset, unsigned int sigsetsize)
381{
382 sigset_t s;
383 compat_sigset_t s32;
384 int ret;
385 mm_segment_t old_fs = get_fs();
386
387 if (set) {
388 if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
389 return -EFAULT;
390 switch (_NSIG_WORDS) {
391 case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
392 case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
393 case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
394 case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
395 }
396 }
397 set_fs (KERNEL_DS);
398 ret = sys_rt_sigprocmask(how,
399 set ? (sigset_t __user *)&s : NULL,
400 oset ? (sigset_t __user *)&s : NULL,
401 sigsetsize);
402 set_fs (old_fs);
403 if (ret) return ret;
404 if (oset) {
405 switch (_NSIG_WORDS) {
406 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
407 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
408 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
409 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
410 }
411 if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
412 return -EFAULT;
413 }
414 return 0;
415}
416
417static inline long
418get_tv32(struct timeval *o, struct compat_timeval __user *i)
419{
420 int err = -EFAULT;
421 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
422 err = __get_user(o->tv_sec, &i->tv_sec);
423 err |= __get_user(o->tv_usec, &i->tv_usec);
424 }
425 return err;
426}
427
428static inline long
429put_tv32(struct compat_timeval __user *o, struct timeval *i)
430{
431 int err = -EFAULT;
432 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
433 err = __put_user(i->tv_sec, &o->tv_sec);
434 err |= __put_user(i->tv_usec, &o->tv_usec);
435 }
436 return err;
437}
438
439extern unsigned int alarm_setitimer(unsigned int seconds);
440
441asmlinkage long
442sys32_alarm(unsigned int seconds)
443{
444 return alarm_setitimer(seconds);
445}
446
447/* Translations due to time_t size differences, which affect all
448 sorts of things, like timeval and itimerval. */
449
450extern struct timezone sys_tz;
451
452asmlinkage long
453sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
454{
455 if (tv) {
456 struct timeval ktv;
457 do_gettimeofday(&ktv);
458 if (put_tv32(tv, &ktv))
459 return -EFAULT;
460 }
461 if (tz) {
462 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
463 return -EFAULT;
464 }
465 return 0;
466}
467
468asmlinkage long
469sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
470{
471 struct timeval ktv;
472 struct timespec kts;
473 struct timezone ktz;
474
475 if (tv) {
476 if (get_tv32(&ktv, tv))
477 return -EFAULT;
478 kts.tv_sec = ktv.tv_sec;
479 kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
480 }
481 if (tz) {
482 if (copy_from_user(&ktz, tz, sizeof(ktz)))
483 return -EFAULT;
484 }
485
486 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
487}
488
489struct sel_arg_struct {
490 unsigned int n;
491 unsigned int inp;
492 unsigned int outp;
493 unsigned int exp;
494 unsigned int tvp;
495};
496
497asmlinkage long
498sys32_old_select(struct sel_arg_struct __user *arg)
499{
500 struct sel_arg_struct a;
501
502 if (copy_from_user(&a, arg, sizeof(a)))
503 return -EFAULT;
504 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
505 compat_ptr(a.exp), compat_ptr(a.tvp));
506}
507
508extern asmlinkage long
509compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
510 struct compat_rusage *ru);
511
512asmlinkage long
513sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
514{
515 return compat_sys_wait4(pid, stat_addr, options, NULL);
516}
517
518/* 32-bit timeval and related flotsam. */
519
520asmlinkage long
521sys32_sysfs(int option, u32 arg1, u32 arg2)
522{
523 return sys_sysfs(option, arg1, arg2);
524}
525
526asmlinkage long
527sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
528{
529 struct timespec t;
530 int ret;
531 mm_segment_t old_fs = get_fs ();
532
533 set_fs (KERNEL_DS);
534 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
535 set_fs (old_fs);
536 if (put_compat_timespec(&t, interval))
537 return -EFAULT;
538 return ret;
539}
540
541asmlinkage long
542sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
543{
544 sigset_t s;
545 compat_sigset_t s32;
546 int ret;
547 mm_segment_t old_fs = get_fs();
548
549 set_fs (KERNEL_DS);
550 ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
551 set_fs (old_fs);
552 if (!ret) {
553 switch (_NSIG_WORDS) {
554 case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
555 case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
556 case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
557 case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
558 }
559 if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
560 return -EFAULT;
561 }
562 return ret;
563}
564
565asmlinkage long
566sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
567{
568 siginfo_t info;
569 int ret;
570 mm_segment_t old_fs = get_fs();
571
572 if (copy_siginfo_from_user32(&info, uinfo))
573 return -EFAULT;
574 set_fs (KERNEL_DS);
575 ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
576 set_fs (old_fs);
577 return ret;
578}
579
580/* This is here just in case some old ia32 binary calls it. */
581asmlinkage long
582sys32_pause(void)
583{
584 current->state = TASK_INTERRUPTIBLE;
585 schedule();
586 return -ERESTARTNOHAND;
587}
588
589
590#ifdef CONFIG_SYSCTL_SYSCALL
591struct sysctl_ia32 {
592 unsigned int name;
593 int nlen;
594 unsigned int oldval;
595 unsigned int oldlenp;
596 unsigned int newval;
597 unsigned int newlen;
598 unsigned int __unused[4];
599};
600
601
602asmlinkage long
603sys32_sysctl(struct sysctl_ia32 __user *args32)
604{
605 struct sysctl_ia32 a32;
606 mm_segment_t old_fs = get_fs ();
607 void __user *oldvalp, *newvalp;
608 size_t oldlen;
609 int __user *namep;
610 long ret;
611
612 if (copy_from_user(&a32, args32, sizeof (a32)))
613 return -EFAULT;
614
615 /*
616 * We need to pre-validate these because we have to disable address
617 * checking before calling do_sysctl() (because of OLDLEN), but we must
618 * not run the risk of the user specifying bad addresses here. Since
619 * we are dealing with 32 bit addresses, we know that access_ok() will
620 * always succeed, so this is an expensive NOP, but so what...
621 */
622 namep = compat_ptr(a32.name);
623 oldvalp = compat_ptr(a32.oldval);
624 newvalp = compat_ptr(a32.newval);
625
626 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
627 || !access_ok(VERIFY_WRITE, namep, 0)
628 || !access_ok(VERIFY_WRITE, oldvalp, 0)
629 || !access_ok(VERIFY_WRITE, newvalp, 0))
630 return -EFAULT;
631
632 set_fs(KERNEL_DS);
633 lock_kernel();
634 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
635 newvalp, (size_t) a32.newlen);
636 unlock_kernel();
637 set_fs(old_fs);
638
639 if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
640 return -EFAULT;
641
642 return ret;
643}
644#endif
645
646/* warning: next two assume little endian */
647asmlinkage long
648sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
649{
650 return sys_pread64(fd, ubuf, count,
651 ((loff_t)AA(poshi) << 32) | AA(poslo));
652}
653
654asmlinkage long
655sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
656{
657 return sys_pwrite64(fd, ubuf, count,
658 ((loff_t)AA(poshi) << 32) | AA(poslo));
659}
660
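sys32_pread()/sys32_pwrite() rebuild a 64-bit offset from the two 32-bit halves the ia32 ABI passes as separate arguments; the caller-side split is just (illustrative values):

/* libc side of the convention (illustrative). */
loff_t pos   = 0x123456789aULL;
u32    poslo = (u32)pos;		/* 0x3456789a, low half  */
u32    poshi = (u32)(pos >> 32);	/* 0x00000012, high half */
/* the kernel wrappers recombine: ((loff_t)AA(poshi) << 32) | AA(poslo) == pos */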
661
662asmlinkage long
663sys32_personality(unsigned long personality)
664{
665 int ret;
666 if (personality(current->personality) == PER_LINUX32 &&
667 personality == PER_LINUX)
668 personality = PER_LINUX32;
669 ret = sys_personality(personality);
670 if (ret == PER_LINUX32)
671 ret = PER_LINUX;
672 return ret;
673}
674
675asmlinkage long
676sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
677{
678 mm_segment_t old_fs = get_fs();
679 int ret;
680 off_t of;
681
682 if (offset && get_user(of, offset))
683 return -EFAULT;
684
685 set_fs(KERNEL_DS);
686 ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
687 count);
688 set_fs(old_fs);
689
690 if (offset && put_user(of, offset))
691 return -EFAULT;
692
693 return ret;
694}
695
696asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
697 unsigned long prot, unsigned long flags,
698 unsigned long fd, unsigned long pgoff)
699{
700 struct mm_struct *mm = current->mm;
701 unsigned long error;
702 struct file * file = NULL;
703
704 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
705 if (!(flags & MAP_ANONYMOUS)) {
706 file = fget(fd);
707 if (!file)
708 return -EBADF;
709 }
710
711 down_write(&mm->mmap_sem);
712 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
713 up_write(&mm->mmap_sem);
714
715 if (file)
716 fput(file);
717 return error;
718}
719
720asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
721{
722 int err;
723
724 if (!name)
725 return -EFAULT;
726 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
727 return -EFAULT;
728
729 down_read(&uts_sem);
730
731 err = __copy_to_user(&name->sysname,&utsname()->sysname,
732 __OLD_UTS_LEN);
733 err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
734 err |= __copy_to_user(&name->nodename,&utsname()->nodename,
735 __OLD_UTS_LEN);
736 err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
737 err |= __copy_to_user(&name->release,&utsname()->release,
738 __OLD_UTS_LEN);
739 err |= __put_user(0,name->release+__OLD_UTS_LEN);
740 err |= __copy_to_user(&name->version,&utsname()->version,
741 __OLD_UTS_LEN);
742 err |= __put_user(0,name->version+__OLD_UTS_LEN);
743 {
744 char *arch = "x86_64";
745 if (personality(current->personality) == PER_LINUX32)
746 arch = "i686";
747
748 err |= __copy_to_user(&name->machine, arch, strlen(arch)+1);
749 }
750
751 up_read(&uts_sem);
752
753 err = err ? -EFAULT : 0;
754
755 return err;
756}
757
758long sys32_uname(struct old_utsname __user * name)
759{
760 int err;
761 if (!name)
762 return -EFAULT;
763 down_read(&uts_sem);
764 err = copy_to_user(name, utsname(), sizeof (*name));
765 up_read(&uts_sem);
766 if (personality(current->personality) == PER_LINUX32)
767 err |= copy_to_user(&name->machine, "i686", 5);
768 return err?-EFAULT:0;
769}
770
771long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
772{
773 struct ustat u;
774 mm_segment_t seg;
775 int ret;
776
777 seg = get_fs();
778 set_fs(KERNEL_DS);
779 ret = sys_ustat(dev, (struct ustat __user *)&u);
780 set_fs(seg);
781 if (ret >= 0) {
782 if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
783 __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
784 __put_user((__u32) u.f_tinode, &u32p->f_tinode) ||
785 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
786 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
787 ret = -EFAULT;
788 }
789 return ret;
790}
791
792asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
793 compat_uptr_t __user *envp, struct pt_regs *regs)
794{
795 long error;
796 char * filename;
797
798 filename = getname(name);
799 error = PTR_ERR(filename);
800 if (IS_ERR(filename))
801 return error;
802 error = compat_do_execve(filename, argv, envp, regs);
803 if (error == 0) {
804 task_lock(current);
805 current->ptrace &= ~PT_DTRACE;
806 task_unlock(current);
807 }
808 putname(filename);
809 return error;
810}
811
812asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
813 struct pt_regs *regs)
814{
815 void __user *parent_tid = (void __user *)regs->rdx;
816 void __user *child_tid = (void __user *)regs->rdi;
817 if (!newsp)
818 newsp = regs->rsp;
819 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
820}
821
822/*
823 * Some system calls need sign-extended arguments. This could be done by a generic wrapper.
824 */
825
826long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
827{
828 return sys_lseek(fd, offset, whence);
829}
830
831long sys32_kill(int pid, int sig)
832{
833 return sys_kill(pid, sig);
834}
835
836long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
837 __u32 len_low, __u32 len_high, int advice)
838{
839 return sys_fadvise64_64(fd,
840 (((u64)offset_high)<<32) | offset_low,
841 (((u64)len_high)<<32) | len_low,
842 advice);
843}
844
845long sys32_vm86_warning(void)
846{
847 struct task_struct *me = current;
848 static char lastcomm[sizeof(me->comm)];
849 if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
850 compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
851 me->comm);
852 strncpy(lastcomm, me->comm, sizeof(lastcomm));
853 }
854 return -ENOSYS;
855}
856
857long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
858 char __user * buf, size_t len)
859{
860 return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
861}
862
863asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count)
864{
865 return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
866}
867
868asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
869 unsigned n_low, unsigned n_hi, int flags)
870{
871 return sys_sync_file_range(fd,
872 ((u64)off_hi << 32) | off_low,
873 ((u64)n_hi << 32) | n_low, flags);
874}
875
876asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len,
877 int advice)
878{
879 return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
880 len, advice);
881}
882
883asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
884 unsigned offset_hi, unsigned len_lo,
885 unsigned len_hi)
886{
887 return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
888 ((u64)len_hi << 32) | len_lo);
889}
diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
new file mode 100644
index 000000000000..15013bac181c
--- /dev/null
+++ b/arch/x86/ia32/syscall32.c
@@ -0,0 +1,83 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
2
3/* vsyscall handling for 32bit processes. Map a stub page into the process
4 on demand because 32bit code cannot reach the kernel's fixmaps */
5
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/kernel.h>
9#include <linux/gfp.h>
10#include <linux/init.h>
11#include <linux/stringify.h>
12#include <linux/security.h>
13#include <asm/proto.h>
14#include <asm/tlbflush.h>
15#include <asm/ia32_unistd.h>
16#include <asm/vsyscall32.h>
17
18extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
19extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
20extern int sysctl_vsyscall32;
21
22static struct page *syscall32_pages[1];
23static int use_sysenter = -1;
24
25struct linux_binprm;
26
27/* Setup a VMA at program startup for the vsyscall page */
28int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
29{
30 struct mm_struct *mm = current->mm;
31 int ret;
32
33 down_write(&mm->mmap_sem);
34 /*
35 * MAYWRITE to allow gdb to COW and set breakpoints
36 *
37 * Make sure the vDSO gets into every core dump.
38 * Dumping its contents makes post-mortem debugging fully interpretable later
39 * without matching up the same kernel and hardware config to see
40 * what PC values meant.
41 */
42 /* Could randomize here */
43 ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE,
44 VM_READ|VM_EXEC|
45 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
46 VM_ALWAYSDUMP,
47 syscall32_pages);
48 up_write(&mm->mmap_sem);
49 return ret;
50}
51
52static int __init init_syscall32(void)
53{
54 char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
55 if (!syscall32_page)
56 panic("Cannot allocate syscall32 page");
57 syscall32_pages[0] = virt_to_page(syscall32_page);
58 if (use_sysenter > 0) {
59 memcpy(syscall32_page, syscall32_sysenter,
60 syscall32_sysenter_end - syscall32_sysenter);
61 } else {
62 memcpy(syscall32_page, syscall32_syscall,
63 syscall32_syscall_end - syscall32_syscall);
64 }
65 return 0;
66}
67
68__initcall(init_syscall32);
69
70/* May not be __init: called during resume */
71void syscall32_cpu_init(void)
72{
73 if (use_sysenter < 0)
74 use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
75
76 /* Load these always in case some future AMD CPU supports
77 SYSENTER from compat mode too. */
78 checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
79 checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
80 checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
81
82 wrmsrl(MSR_CSTAR, ia32_cstar_target);
83}
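On the userspace side, a 32-bit process locates the stub that init_syscall32() installs through the ELF auxiliary vector; a hedged sketch using glibc's getauxval (assumed available):

#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* entry of __kernel_vsyscall inside the page mapped at VSYSCALL32_BASE */
	unsigned long entry = getauxval(AT_SYSINFO);

	printf("vsyscall entry: %#lx\n", entry);
	return 0;
}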
diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S
new file mode 100644
index 000000000000..933f0f08b1cf
--- /dev/null
+++ b/arch/x86/ia32/syscall32_syscall.S
@@ -0,0 +1,17 @@
1/* 32bit VDSOs mapped into user space. */
2
3 .section ".init.data","aw"
4
5 .globl syscall32_syscall
6 .globl syscall32_syscall_end
7
8syscall32_syscall:
9 .incbin "arch/x86/ia32/vsyscall-syscall.so"
10syscall32_syscall_end:
11
12 .globl syscall32_sysenter
13 .globl syscall32_sysenter_end
14
15syscall32_sysenter:
16 .incbin "arch/x86/ia32/vsyscall-sysenter.so"
17syscall32_sysenter_end:
diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c
new file mode 100644
index 000000000000..1cc4340de3ca
--- /dev/null
+++ b/arch/x86/ia32/tls32.c
@@ -0,0 +1,163 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5
6#include <asm/uaccess.h>
7#include <asm/desc.h>
8#include <asm/system.h>
9#include <asm/ldt.h>
10#include <asm/processor.h>
11#include <asm/proto.h>
12
13/*
14 * get_free_idx: find a yet-unused TLS descriptor index.
15 */
16static int get_free_idx(void)
17{
18 struct thread_struct *t = &current->thread;
19 int idx;
20
21 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
22 if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
23 return idx + GDT_ENTRY_TLS_MIN;
24 return -ESRCH;
25}
26
27/*
28 * Set a given TLS descriptor:
29 * When you want addresses > 32bit use arch_prctl()
30 */
31int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
32{
33 struct user_desc info;
34 struct n_desc_struct *desc;
35 int cpu, idx;
36
37 if (copy_from_user(&info, u_info, sizeof(info)))
38 return -EFAULT;
39
40 idx = info.entry_number;
41
42 /*
43 * index -1 means the kernel should try to find and
44 * allocate an empty descriptor:
45 */
46 if (idx == -1) {
47 idx = get_free_idx();
48 if (idx < 0)
49 return idx;
50 if (put_user(idx, &u_info->entry_number))
51 return -EFAULT;
52 }
53
54 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
55 return -EINVAL;
56
57 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
58
59 /*
60 * We must not get preempted while modifying the TLS.
61 */
62 cpu = get_cpu();
63
64 if (LDT_empty(&info)) {
65 desc->a = 0;
66 desc->b = 0;
67 } else {
68 desc->a = LDT_entry_a(&info);
69 desc->b = LDT_entry_b(&info);
70 }
71 if (t == &current->thread)
72 load_TLS(t, cpu);
73
74 put_cpu();
75 return 0;
76}
77
78asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
79{
80 return do_set_thread_area(&current->thread, u_info);
81}
82
83
84/*
85 * Get the current Thread-Local Storage area:
86 */
87
88#define GET_BASE(desc) ( \
89 (((desc)->a >> 16) & 0x0000ffff) | \
90 (((desc)->b << 16) & 0x00ff0000) | \
91 ( (desc)->b & 0xff000000) )
92
93#define GET_LIMIT(desc) ( \
94 ((desc)->a & 0x0ffff) | \
95 ((desc)->b & 0xf0000) )
96
97#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
98#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
99#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
100#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
101#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
102#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
103#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
104
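To make the bit-slicing above concrete, a worked example with invented values: a descriptor whose base is 0x12345678 is stored as a = 0x56780000, b = 0x12000034, and GET_BASE() reassembles it:

/*
 *   ((a >> 16) & 0x0000ffff) = 0x00005678   base 15..0
 *   ((b << 16) & 0x00ff0000) = 0x00340000   base 23..16
 *   ( b        & 0xff000000) = 0x12000000   base 31..24
 *   OR'd together            = 0x12345678
 */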
105int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
106{
107 struct user_desc info;
108 struct n_desc_struct *desc;
109 int idx;
110
111 if (get_user(idx, &u_info->entry_number))
112 return -EFAULT;
113 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
114 return -EINVAL;
115
116 desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
117
118 memset(&info, 0, sizeof(struct user_desc));
119 info.entry_number = idx;
120 info.base_addr = GET_BASE(desc);
121 info.limit = GET_LIMIT(desc);
122 info.seg_32bit = GET_32BIT(desc);
123 info.contents = GET_CONTENTS(desc);
124 info.read_exec_only = !GET_WRITABLE(desc);
125 info.limit_in_pages = GET_LIMIT_PAGES(desc);
126 info.seg_not_present = !GET_PRESENT(desc);
127 info.useable = GET_USEABLE(desc);
128 info.lm = GET_LONGMODE(desc);
129
130 if (copy_to_user(u_info, &info, sizeof(info)))
131 return -EFAULT;
132 return 0;
133}
134
135asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
136{
137 return do_get_thread_area(&current->thread, u_info);
138}
139
140
141int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
142{
143 struct n_desc_struct *desc;
144 struct user_desc info;
145 struct user_desc __user *cp;
146 int idx;
147
148 cp = (void __user *)childregs->rsi;
149 if (copy_from_user(&info, cp, sizeof(info)))
150 return -EFAULT;
151 if (LDT_empty(&info))
152 return -EINVAL;
153
154 idx = info.entry_number;
155 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
156 return -EINVAL;
157
158 desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
159 desc->a = LDT_entry_a(&info);
160 desc->b = LDT_entry_b(&info);
161
162 return 0;
163}
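Userspace side of the entry_number = -1 convention that get_free_idx() serves, as a hedged sketch (243 is the i386 __NR_set_thread_area number; tls_block is an assumed buffer and the program must be built 32-bit):

#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>

static char tls_block[4096];

int install_tls(void)
{
	struct user_desc d = {
		.entry_number	= -1,	/* let the kernel pick a free GDT slot */
		.base_addr	= (unsigned long)tls_block,
		.limit		= sizeof(tls_block) - 1,
		.seg_32bit	= 1,
		.useable	= 1,
	};

	/* on success d.entry_number holds the index the kernel chose */
	return syscall(243 /* i386 __NR_set_thread_area */, &d);
}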
diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..b383be00baec
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-sigreturn.S
@@ -0,0 +1,143 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
4 * to enter the kernel.
5 * This file is #include'd by vsyscall-*.S to define them after the
6 * vsyscall entry point. The addresses we get for these entry points
7 * by doing ".balign 32" must match in both versions of the page.
8 */
9
10 .code32
11 .section .text.sigreturn,"ax"
12 .balign 32
13 .globl __kernel_sigreturn
14 .type __kernel_sigreturn,@function
15__kernel_sigreturn:
16.LSTART_sigreturn:
17 popl %eax
18 movl $__NR_ia32_sigreturn, %eax
19 SYSCALL_ENTER_KERNEL
20.LEND_sigreturn:
21 .size __kernel_sigreturn,.-.LSTART_sigreturn
22
23 .section .text.rtsigreturn,"ax"
24 .balign 32
25 .globl __kernel_rt_sigreturn
26 .type __kernel_rt_sigreturn,@function
27__kernel_rt_sigreturn:
28.LSTART_rt_sigreturn:
29 movl $__NR_ia32_rt_sigreturn, %eax
30 SYSCALL_ENTER_KERNEL
31.LEND_rt_sigreturn:
32 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
33
34 .section .eh_frame,"a",@progbits
35.LSTARTFRAMES:
36 .long .LENDCIES-.LSTARTCIES
37.LSTARTCIES:
38 .long 0 /* CIE ID */
39 .byte 1 /* Version number */
40 .string "zRS" /* NUL-terminated augmentation string */
41 .uleb128 1 /* Code alignment factor */
42 .sleb128 -4 /* Data alignment factor */
43 .byte 8 /* Return address register column */
44 .uleb128 1 /* Augmentation value length */
45 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
46 .byte 0x0c /* DW_CFA_def_cfa */
47 .uleb128 4
48 .uleb128 4
49 .byte 0x88 /* DW_CFA_offset, column 0x8 */
50 .uleb128 1
51 .align 4
52.LENDCIES:
53
54 .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
55.LSTARTFDE2:
56 .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
57 /* HACK: The dwarf2 unwind routines will subtract 1 from the
58 return address to get an address in the middle of the
59 presumed call instruction. Since we didn't get here via
60 a call, we need to include the nop before the real start
61 to make up for it. */
62 .long .LSTART_sigreturn-1-. /* PC-relative start address */
63 .long .LEND_sigreturn-.LSTART_sigreturn+1
64 .uleb128 0 /* Augmentation length */
65 /* What follows are the instructions for the table generation.
66 We record the locations of each register saved. This is
67 complicated by the fact that the "CFA" is always assumed to
68 be the value of the stack pointer in the caller. This means
69 that we must define the CFA of this body of code to be the
70 saved value of the stack pointer in the sigcontext. Which
71 also means that there is no fixed relation to the other
72 saved registers, which means that we must use DW_CFA_expression
73 to compute their addresses. It also means that when we
74 adjust the stack with the popl, we have to do it all over again. */
75
76#define do_cfa_expr(offset) \
77 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
78 .uleb128 1f-0f; /* length */ \
790: .byte 0x74; /* DW_OP_breg4 */ \
80 .sleb128 offset; /* offset */ \
81 .byte 0x06; /* DW_OP_deref */ \
821:
83
84#define do_expr(regno, offset) \
85 .byte 0x10; /* DW_CFA_expression */ \
86 .uleb128 regno; /* regno */ \
87 .uleb128 1f-0f; /* length */ \
880: .byte 0x74; /* DW_OP_breg4 */ \
89 .sleb128 offset; /* offset */ \
901:
91
92 do_cfa_expr(IA32_SIGCONTEXT_esp+4)
93 do_expr(0, IA32_SIGCONTEXT_eax+4)
94 do_expr(1, IA32_SIGCONTEXT_ecx+4)
95 do_expr(2, IA32_SIGCONTEXT_edx+4)
96 do_expr(3, IA32_SIGCONTEXT_ebx+4)
97 do_expr(5, IA32_SIGCONTEXT_ebp+4)
98 do_expr(6, IA32_SIGCONTEXT_esi+4)
99 do_expr(7, IA32_SIGCONTEXT_edi+4)
100 do_expr(8, IA32_SIGCONTEXT_eip+4)
101
102 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
103
104 do_cfa_expr(IA32_SIGCONTEXT_esp)
105 do_expr(0, IA32_SIGCONTEXT_eax)
106 do_expr(1, IA32_SIGCONTEXT_ecx)
107 do_expr(2, IA32_SIGCONTEXT_edx)
108 do_expr(3, IA32_SIGCONTEXT_ebx)
109 do_expr(5, IA32_SIGCONTEXT_ebp)
110 do_expr(6, IA32_SIGCONTEXT_esi)
111 do_expr(7, IA32_SIGCONTEXT_edi)
112 do_expr(8, IA32_SIGCONTEXT_eip)
113
114 .align 4
115.LENDFDE2:
116
117 .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
118.LSTARTFDE3:
119 .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
120 /* HACK: See above wrt unwind library assumptions. */
121 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
122 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
123 .uleb128 0 /* Augmentation */
124 /* What follows are the instructions for the table generation.
125 We record the locations of each register saved. This is
126 slightly less complicated than the above, since we don't
127 modify the stack pointer in the process. */
128
129 do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
130 do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
131 do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
132 do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
133 do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
134 do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
135 do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
136 do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
137 do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
138
139 .align 4
140.LENDFDE3:
141
142#include "../../x86/kernel/vsyscall-note_32.S"
143
diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/ia32/vsyscall-syscall.S
new file mode 100644
index 000000000000..cf9ef678de3e
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-syscall.S
@@ -0,0 +1,69 @@
1/*
2 * Code for the vsyscall page. This version uses the syscall instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7#include <asm/segment.h>
8
9 .code32
10 .text
11 .section .text.vsyscall,"ax"
12 .globl __kernel_vsyscall
13 .type __kernel_vsyscall,@function
14__kernel_vsyscall:
15.LSTART_vsyscall:
16 push %ebp
17.Lpush_ebp:
18 movl %ecx, %ebp
19 syscall
20 movl $__USER32_DS, %ecx
21 movl %ecx, %ss
22 movl %ebp, %ecx
23 popl %ebp
24.Lpop_ebp:
25 ret
26.LEND_vsyscall:
27 .size __kernel_vsyscall,.-.LSTART_vsyscall
28
29 .section .eh_frame,"a",@progbits
30.LSTARTFRAME:
31 .long .LENDCIE-.LSTARTCIE
32.LSTARTCIE:
33 .long 0 /* CIE ID */
34 .byte 1 /* Version number */
35 .string "zR" /* NUL-terminated augmentation string */
36 .uleb128 1 /* Code alignment factor */
37 .sleb128 -4 /* Data alignment factor */
38 .byte 8 /* Return address register column */
39 .uleb128 1 /* Augmentation value length */
40 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
41 .byte 0x0c /* DW_CFA_def_cfa */
42 .uleb128 4
43 .uleb128 4
44 .byte 0x88 /* DW_CFA_offset, column 0x8 */
45 .uleb128 1
46 .align 4
47.LENDCIE:
48
49 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
50.LSTARTFDE1:
51 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
52 .long .LSTART_vsyscall-. /* PC-relative start address */
53 .long .LEND_vsyscall-.LSTART_vsyscall
54 .uleb128 0 /* Augmentation length */
55 /* What follows are the instructions for the table generation.
56 We have to record all changes of the stack pointer. */
57 .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
58 .byte 0x0e /* DW_CFA_def_cfa_offset */
59 .uleb128 8
60 .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
61 .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
62 .byte 0xc5 /* DW_CFA_restore %ebp */
63 .byte 0x0e /* DW_CFA_def_cfa_offset */
64 .uleb128 4
65 .align 4
66.LENDFDE1:
67
68#define SYSCALL_ENTER_KERNEL syscall
69#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S
new file mode 100644
index 000000000000..ae056e553d13
--- /dev/null
+++ b/arch/x86/ia32/vsyscall-sysenter.S
@@ -0,0 +1,95 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 */
4
5#include <asm/ia32_unistd.h>
6#include <asm/asm-offsets.h>
7
8 .code32
9 .text
10 .section .text.vsyscall,"ax"
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 push %ecx
16.Lpush_ecx:
17 push %edx
18.Lpush_edx:
19 push %ebp
20.Lenter_kernel:
21 movl %esp,%ebp
22 sysenter
23 .space 7,0x90
24 jmp .Lenter_kernel
25 /* 16: System call normal return point is here! */
26 pop %ebp
27.Lpop_ebp:
28 pop %edx
29.Lpop_edx:
30 pop %ecx
31.Lpop_ecx:
32 ret
33.LEND_vsyscall:
34 .size __kernel_vsyscall,.-.LSTART_vsyscall
35
36 .section .eh_frame,"a",@progbits
37.LSTARTFRAME:
38 .long .LENDCIE-.LSTARTCIE
39.LSTARTCIE:
40 .long 0 /* CIE ID */
41 .byte 1 /* Version number */
42 .string "zR" /* NUL-terminated augmentation string */
43 .uleb128 1 /* Code alignment factor */
44 .sleb128 -4 /* Data alignment factor */
45 .byte 8 /* Return address register column */
46 .uleb128 1 /* Augmentation value length */
47 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
48 .byte 0x0c /* DW_CFA_def_cfa */
49 .uleb128 4
50 .uleb128 4
51 .byte 0x88 /* DW_CFA_offset, column 0x8 */
52 .uleb128 1
53 .align 4
54.LENDCIE:
55
56 .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
57.LSTARTFDE1:
58 .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
59 .long .LSTART_vsyscall-. /* PC-relative start address */
60 .long .LEND_vsyscall-.LSTART_vsyscall
61 .uleb128 0 /* Augmentation length */
62 /* What follows are the instructions for the table generation.
63 We have to record all changes of the stack pointer. */
64 .byte 0x04 /* DW_CFA_advance_loc4 */
65 .long .Lpush_ecx-.LSTART_vsyscall
66 .byte 0x0e /* DW_CFA_def_cfa_offset */
67 .byte 0x08 /* RA at offset 8 now */
68 .byte 0x04 /* DW_CFA_advance_loc4 */
69 .long .Lpush_edx-.Lpush_ecx
70 .byte 0x0e /* DW_CFA_def_cfa_offset */
71 .byte 0x0c /* RA at offset 12 now */
72 .byte 0x04 /* DW_CFA_advance_loc4 */
73 .long .Lenter_kernel-.Lpush_edx
74 .byte 0x0e /* DW_CFA_def_cfa_offset */
75 .byte 0x10 /* RA at offset 16 now */
76 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
77 /* Finally the epilogue. */
78 .byte 0x04 /* DW_CFA_advance_loc4 */
79 .long .Lpop_ebp-.Lenter_kernel
80 .byte 0x0e /* DW_CFA_def_cfa_offset */
81 .byte 0x12 /* RA at offset 12 now */
82 .byte 0xc5 /* DW_CFA_restore %ebp */
83 .byte 0x04 /* DW_CFA_advance_loc4 */
84 .long .Lpop_edx-.Lpop_ebp
85 .byte 0x0e /* DW_CFA_def_cfa_offset */
86 .byte 0x08 /* RA at offset 8 now */
87 .byte 0x04 /* DW_CFA_advance_loc4 */
88 .long .Lpop_ecx-.Lpop_edx
89 .byte 0x0e /* DW_CFA_def_cfa_offset */
90 .byte 0x04 /* RA at offset 4 now */
91 .align 4
92.LENDFDE1:
93
94#define SYSCALL_ENTER_KERNEL int $0x80
95#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds
new file mode 100644
index 000000000000..1dc86ff5bcb9
--- /dev/null
+++ b/arch/x86/ia32/vsyscall.lds
@@ -0,0 +1,80 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address. This script controls its layout.
4 */
5
6/* This must match <asm/fixmap.h>. */
7VSYSCALL_BASE = 0xffffe000;
8
9SECTIONS
10{
11 . = VSYSCALL_BASE + SIZEOF_HEADERS;
12
13 .hash : { *(.hash) } :text
14 .gnu.hash : { *(.gnu.hash) }
15 .dynsym : { *(.dynsym) }
16 .dynstr : { *(.dynstr) }
17 .gnu.version : { *(.gnu.version) }
18 .gnu.version_d : { *(.gnu.version_d) }
19 .gnu.version_r : { *(.gnu.version_r) }
20
21 /* This linker script is used both with -r and with -shared.
22 For the layouts to match, we need to skip more than enough
23 space for the dynamic symbol table et al. If this amount
24 is insufficient, ld -shared will barf. Just increase it here. */
25 . = VSYSCALL_BASE + 0x400;
26
27 .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
28
29 /* This is an 32bit object and we cannot easily get the offsets
30 into the 64bit kernel. Just hardcode them here. This assumes
31 that all the stubs don't need more than 0x100 bytes. */
32 . = VSYSCALL_BASE + 0x500;
33
34 .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
35
36 . = VSYSCALL_BASE + 0x600;
37
38 .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
39
40 .note : { *(.note.*) } :text :note
41 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
42 .eh_frame : { KEEP (*(.eh_frame)) } :text
43 .dynamic : { *(.dynamic) } :text :dynamic
44 .useless : {
45 *(.got.plt) *(.got)
46 *(.data .data.* .gnu.linkonce.d.*)
47 *(.dynbss)
48 *(.bss .bss.* .gnu.linkonce.b.*)
49 } :text
50}
51
52/*
53 * We must supply the ELF program headers explicitly to get just one
54 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
55 */
56PHDRS
57{
58 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
59 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
60 note PT_NOTE FLAGS(4); /* PF_R */
61 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
62}
63
64/*
65 * This controls what symbols we export from the DSO.
66 */
67VERSION
68{
69 LINUX_2.5 {
70 global:
71 __kernel_vsyscall;
72 __kernel_sigreturn;
73 __kernel_rt_sigreturn;
74
75 local: *;
76 };
77}
78
79/* The ELF entry point can be used to set the AT_SYSINFO value. */
80ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 000000000000..40836ad9079c
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1 @@
vsyscall.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 000000000000..45855c97923e
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/Makefile_32
3else
4include ${srctree}/arch/x86/kernel/Makefile_64
5endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
new file mode 100644
index 000000000000..c624193740fd
--- /dev/null
+++ b/arch/x86/kernel/Makefile_32
@@ -0,0 +1,86 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task_32.o vmlinux.lds
6
7obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
8 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
9 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
10 quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o
13obj-y += cpu/
14obj-y += acpi/
15obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
16obj-$(CONFIG_MCA) += mca_32.o
17obj-$(CONFIG_X86_MSR) += msr.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_MICROCODE) += microcode.o
20obj-$(CONFIG_APM) += apm_32.o
21obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
22obj-$(CONFIG_SMP) += smpcommon_32.o
23obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
24obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
25obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
26obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
27obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
28obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o
29obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
30obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
31obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
32obj-$(CONFIG_KPROBES) += kprobes_32.o
33obj-$(CONFIG_MODULES) += module_32.o
34obj-y += sysenter_32.o vsyscall_32.o
35obj-$(CONFIG_ACPI_SRAT) += srat_32.o
36obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
37obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
38obj-$(CONFIG_VM86) += vm86_32.o
39obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet_32.o
41obj-$(CONFIG_K8_NB) += k8.o
42obj-$(CONFIG_MGEODE_LX) += geode_32.o
43
44obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
45obj-$(CONFIG_PARAVIRT) += paravirt_32.o
46obj-y += pcspeaker.o
47
48obj-$(CONFIG_SCx200) += scx200_32.o
49
50# vsyscall_32.o contains the vsyscall DSO images as __initdata.
51# We must build both images before we can assemble it.
52# Note: kbuild does not track this dependency due to usage of .incbin
53$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
54targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
55targets += vsyscall-note_32.o vsyscall_32.lds
56
57# The DSO images are built using a special linker script.
58quiet_cmd_syscall = SYSCALL $@
59 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
60 -Wl,-T,$(filter-out FORCE,$^) -o $@
61
62export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)
63
64vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
65 $(call ld-option, -Wl$(comma)--hash-style=sysv)
66SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
67SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
68
69$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
70$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
71 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
72 $(call if_changed,syscall)
73
74# We also create a special relocatable object that should mirror the symbol
75# table and layout of the linked DSO. With ld -R we can then refer to
76# these symbols in the kernel code rather than hand-coded addresses.
77extra-y += vsyscall-syms.o
78$(obj)/built-in.o: $(obj)/vsyscall-syms.o
79$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
80
81SYSCFLAGS_vsyscall-syms.o = -r
82$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
83 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
84 $(call if_changed,syscall)
85
86
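
The last two stanzas above embed the vsyscall DSO images into the kernel: vsyscall_32.o pulls both .so files in as __initdata (via .incbin, which is why the dependency has to be spelled out by hand), and vsyscall-syms.o is linked into built-in.o with ld -R so kernel C code can refer to the DSO's symbols instead of hand-coded addresses. A minimal, self-contained sketch of the embed-and-copy pattern this sets up; the byte array stands in for the .incbin'd image, and the names are illustrative rather than the kernel's exact symbols:

/*
 * Sketch: an embedded DSO image is copied into the (future) vsyscall
 * page at boot.  Here a byte array plays the role of the .incbin'd
 * image that the real build brackets with start/end labels in a .S file.
 */
#include <stdio.h>
#include <string.h>

static const unsigned char vsyscall_image[] = { 0x7f, 'E', 'L', 'F' };
static unsigned char vsyscall_page[4096];

int main(void)
{
	memcpy(vsyscall_page, vsyscall_image, sizeof(vsyscall_image));
	printf("installed %zu bytes of vsyscall DSO image\n",
	       sizeof(vsyscall_image));
	return 0;
}
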
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
new file mode 100644
index 000000000000..3ab017a0a3b9
--- /dev/null
+++ b/arch/x86/kernel/Makefile_64
@@ -0,0 +1,54 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_64.o head64.o init_task_64.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
7obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
8 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
9 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
10 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
11 pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
12 perfctr-watchdog.o
13
14obj-$(CONFIG_STACKTRACE) += stacktrace.o
15obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o
16obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
17obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
18obj-$(CONFIG_MTRR) += cpu/mtrr/
19obj-$(CONFIG_ACPI) += acpi/
20obj-$(CONFIG_X86_MSR) += msr.o
21obj-$(CONFIG_MICROCODE) += microcode.o
22obj-$(CONFIG_X86_CPUID) += cpuid.o
23obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
24obj-y += apic_64.o nmi_64.o
25obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
26obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o
27obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
28obj-$(CONFIG_PM) += suspend_64.o
29obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
30obj-$(CONFIG_CPU_FREQ) += cpu/cpufreq/
31obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
32obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o
33obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
34obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
35obj-$(CONFIG_KPROBES) += kprobes_64.o
36obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
37obj-$(CONFIG_X86_VSMP) += vsmp_64.o
38obj-$(CONFIG_K8_NB) += k8.o
39obj-$(CONFIG_AUDIT) += audit_64.o
40
41obj-$(CONFIG_MODULES) += module_64.o
42obj-$(CONFIG_PCI) += early-quirks_64.o
43
44obj-y += topology.o
45obj-y += intel_cacheinfo.o
46obj-y += addon_cpuid_features.o
47obj-y += pcspeaker.o
48
49CFLAGS_vsyscall_64.o := $(PROFILING) -g0
50
51therm_throt-y += cpu/mcheck/therm_throt.o
52intel_cacheinfo-y += cpu/intel_cacheinfo.o
53addon_cpuid_features-y += cpu/addon_cpuid_features.o
54perfctr-watchdog-y += cpu/perfctr-watchdog.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 000000000000..3d5671939542
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/kernel/acpi/Makefile_32
3else
4include ${srctree}/arch/x86/kernel/acpi/Makefile_64
5endif
diff --git a/arch/x86/kernel/acpi/Makefile_32 b/arch/x86/kernel/acpi/Makefile_32
new file mode 100644
index 000000000000..a4852a2e9190
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile_32
@@ -0,0 +1,10 @@
1obj-$(CONFIG_ACPI) += boot.o
2ifneq ($(CONFIG_PCI),)
3obj-$(CONFIG_X86_IO_APIC) += earlyquirk_32.o
4endif
5obj-$(CONFIG_ACPI_SLEEP) += sleep_32.o wakeup_32.o
6
7ifneq ($(CONFIG_ACPI_PROCESSOR),)
8obj-y += cstate.o processor.o
9endif
10
diff --git a/arch/x86/kernel/acpi/Makefile_64 b/arch/x86/kernel/acpi/Makefile_64
new file mode 100644
index 000000000000..629425bc002d
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile_64
@@ -0,0 +1,7 @@
1obj-y := boot.o
2obj-$(CONFIG_ACPI_SLEEP) += sleep_64.o wakeup_64.o
3
4ifneq ($(CONFIG_ACPI_PROCESSOR),)
5obj-y += processor.o cstate.o
6endif
7
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 000000000000..afd2afe9102d
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1326 @@
1/*
2 * boot.c - Architecture-Specific Low-Level ACPI Boot Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 *
7 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/init.h>
27#include <linux/acpi.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/efi.h>
30#include <linux/cpumask.h>
31#include <linux/module.h>
32#include <linux/dmi.h>
33#include <linux/irq.h>
34#include <linux/bootmem.h>
35#include <linux/ioport.h>
36
37#include <asm/pgtable.h>
38#include <asm/io_apic.h>
39#include <asm/apic.h>
40#include <asm/io.h>
41#include <asm/mpspec.h>
42
43static int __initdata acpi_force = 0;
44
45#ifdef CONFIG_ACPI
46int acpi_disabled = 0;
47#else
48int acpi_disabled = 1;
49#endif
50EXPORT_SYMBOL(acpi_disabled);
51
52#ifdef CONFIG_X86_64
53
54#include <asm/proto.h>
55
56static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
57
58
59#else /* X86 */
60
61#ifdef CONFIG_X86_LOCAL_APIC
62#include <mach_apic.h>
63#include <mach_mpparse.h>
64#endif /* CONFIG_X86_LOCAL_APIC */
65
66#endif /* X86 */
67
68#define BAD_MADT_ENTRY(entry, end) ( \
69 (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
70 ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
71
72#define PREFIX "ACPI: "
73
74int acpi_noirq; /* skip ACPI IRQ initialization */
75int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
76int acpi_ht __initdata = 1; /* enable HT */
77
78int acpi_lapic;
79int acpi_ioapic;
80int acpi_strict;
81EXPORT_SYMBOL(acpi_strict);
82
83u8 acpi_sci_flags __initdata;
84int acpi_sci_override_gsi __initdata;
85int acpi_skip_timer_override __initdata;
86int acpi_use_timer_override __initdata;
87
88#ifdef CONFIG_X86_LOCAL_APIC
89static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
90#endif
91
92#ifndef __HAVE_ARCH_CMPXCHG
93#warning ACPI uses CMPXCHG, i486 and later hardware
94#endif
95
96/* --------------------------------------------------------------------------
97 Boot-time Configuration
98 -------------------------------------------------------------------------- */
99
100/*
101 * The default interrupt routing model is PIC (8259). This gets
102 * overridden if IOAPICs are enumerated (below).
103 */
104enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
105
106#ifdef CONFIG_X86_64
107
108/* rely on all ACPI tables being in the direct mapping */
109char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
110{
111 if (!phys_addr || !size)
112 return NULL;
113
114 if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
115 return __va(phys_addr);
116
117 return NULL;
118}
119
120#else
121
122/*
123 * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
124 * to map the target physical address. The problem is that set_fixmap()
125 * provides a single page, and it is possible that the page is not
126 * sufficient.
127 * By using this area, we can map up to MAX_IO_APICS pages temporarily,
128 * i.e. until the next __va_range() call.
129 *
130 * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
131 * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
132 * count idx down while incrementing the phys address.
133 */
134char *__acpi_map_table(unsigned long phys, unsigned long size)
135{
136 unsigned long base, offset, mapped_size;
137 int idx;
138
139 if (phys + size < 8 * 1024 * 1024)
140 return __va(phys);
141
142 offset = phys & (PAGE_SIZE - 1);
143 mapped_size = PAGE_SIZE - offset;
144 set_fixmap(FIX_ACPI_END, phys);
145 base = fix_to_virt(FIX_ACPI_END);
146
147 /*
148 * Most cases can be covered by the below.
149 */
150 idx = FIX_ACPI_END;
151 while (mapped_size < size) {
152 if (--idx < FIX_ACPI_BEGIN)
153 return NULL; /* cannot handle this */
154 phys += PAGE_SIZE;
155 set_fixmap(idx, phys);
156 mapped_size += PAGE_SIZE;
157 }
158
159 return ((unsigned char *)base + offset);
160}
161#endif
162
163#ifdef CONFIG_PCI_MMCONFIG
164/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
165struct acpi_mcfg_allocation *pci_mmcfg_config;
166int pci_mmcfg_config_num;
167
168int __init acpi_parse_mcfg(struct acpi_table_header *header)
169{
170 struct acpi_table_mcfg *mcfg;
171 unsigned long i;
172 int config_size;
173
174 if (!header)
175 return -EINVAL;
176
177 mcfg = (struct acpi_table_mcfg *)header;
178
179 /* how many config structures do we have */
180 pci_mmcfg_config_num = 0;
181 i = header->length - sizeof(struct acpi_table_mcfg);
182 while (i >= sizeof(struct acpi_mcfg_allocation)) {
183 ++pci_mmcfg_config_num;
184 i -= sizeof(struct acpi_mcfg_allocation);
185 };
186 if (pci_mmcfg_config_num == 0) {
187 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
188 return -ENODEV;
189 }
190
191 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
192 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
193 if (!pci_mmcfg_config) {
194 printk(KERN_WARNING PREFIX
195 "No memory for MCFG config tables\n");
196 return -ENOMEM;
197 }
198
199 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
200 for (i = 0; i < pci_mmcfg_config_num; ++i) {
201 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
202 printk(KERN_ERR PREFIX
203 "MMCONFIG not in low 4GB of memory\n");
204 kfree(pci_mmcfg_config);
205 pci_mmcfg_config_num = 0;
206 return -ENODEV;
207 }
208 }
209
210 return 0;
211}
212#endif /* CONFIG_PCI_MMCONFIG */
213
214#ifdef CONFIG_X86_LOCAL_APIC
215static int __init acpi_parse_madt(struct acpi_table_header *table)
216{
217 struct acpi_table_madt *madt = NULL;
218
219 if (!cpu_has_apic)
220 return -EINVAL;
221
222 madt = (struct acpi_table_madt *)table;
223 if (!madt) {
224 printk(KERN_WARNING PREFIX "Unable to map MADT\n");
225 return -ENODEV;
226 }
227
228 if (madt->address) {
229 acpi_lapic_addr = (u64) madt->address;
230
231 printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
232 madt->address);
233 }
234
235 acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
236
237 return 0;
238}
239
240static int __init
241acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
242{
243 struct acpi_madt_local_apic *processor = NULL;
244
245 processor = (struct acpi_madt_local_apic *)header;
246
247 if (BAD_MADT_ENTRY(processor, end))
248 return -EINVAL;
249
250 acpi_table_print_madt_entry(header);
251
252 /*
253 * We need to register disabled CPUs as well, to permit
254 * counting them. This allows us to size
255 * cpus_possible_map more accurately, so that we do not
256 * have to preallocate memory for all NR_CPUS
257 * when CPU hotplug is used.
258 */
259 mp_register_lapic(processor->id, /* APIC ID */
260 processor->lapic_flags & ACPI_MADT_ENABLED); /* Enabled? */
261
262 return 0;
263}
264
265static int __init
266acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
267 const unsigned long end)
268{
269 struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
270
271 lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
272
273 if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
274 return -EINVAL;
275
276 acpi_lapic_addr = lapic_addr_ovr->address;
277
278 return 0;
279}
280
281static int __init
282acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
283{
284 struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
285
286 lapic_nmi = (struct acpi_madt_local_apic_nmi *)header;
287
288 if (BAD_MADT_ENTRY(lapic_nmi, end))
289 return -EINVAL;
290
291 acpi_table_print_madt_entry(header);
292
293 if (lapic_nmi->lint != 1)
294 printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
295
296 return 0;
297}
298
299#endif /*CONFIG_X86_LOCAL_APIC */
300
301#ifdef CONFIG_X86_IO_APIC
302
303static int __init
304acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
305{
306 struct acpi_madt_io_apic *ioapic = NULL;
307
308 ioapic = (struct acpi_madt_io_apic *)header;
309
310 if (BAD_MADT_ENTRY(ioapic, end))
311 return -EINVAL;
312
313 acpi_table_print_madt_entry(header);
314
315 mp_register_ioapic(ioapic->id,
316 ioapic->address, ioapic->global_irq_base);
317
318 return 0;
319}
320
321/*
322 * Parse Interrupt Source Override for the ACPI SCI
323 */
324static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
325{
326 if (trigger == 0) /* compatible SCI trigger is level */
327 trigger = 3;
328
329 if (polarity == 0) /* compatible SCI polarity is low */
330 polarity = 3;
331
332 /* Command-line over-ride via acpi_sci= */
333 if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
334 trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
335
336 if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
337 polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
338
339 /*
340 * mp_config_acpi_legacy_irqs() already set up IRQs < 16.
341 * If GSI is < 16, this will update its flags,
342 * else it will create a new mp_irqs[] entry.
343 */
344 mp_override_legacy_irq(gsi, polarity, trigger, gsi);
345
346 /*
347 * stash over-ride to indicate we've been here
348 * and for later update of acpi_gbl_FADT
349 */
350 acpi_sci_override_gsi = gsi;
351 return;
352}
353
354static int __init
355acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
356 const unsigned long end)
357{
358 struct acpi_madt_interrupt_override *intsrc = NULL;
359
360 intsrc = (struct acpi_madt_interrupt_override *)header;
361
362 if (BAD_MADT_ENTRY(intsrc, end))
363 return -EINVAL;
364
365 acpi_table_print_madt_entry(header);
366
367 if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
368 acpi_sci_ioapic_setup(intsrc->global_irq,
369 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
370 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
371 return 0;
372 }
373
374 if (acpi_skip_timer_override &&
375 intsrc->source_irq == 0 && intsrc->global_irq == 2) {
376 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
377 return 0;
378 }
379
380 mp_override_legacy_irq(intsrc->source_irq,
381 intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
382 (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
383 intsrc->global_irq);
384
385 return 0;
386}
387
388static int __init
389acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
390{
391 struct acpi_madt_nmi_source *nmi_src = NULL;
392
393 nmi_src = (struct acpi_madt_nmi_source *)header;
394
395 if (BAD_MADT_ENTRY(nmi_src, end))
396 return -EINVAL;
397
398 acpi_table_print_madt_entry(header);
399
400 /* TBD: Support nmi_src entries? */
401
402 return 0;
403}
404
405#endif /* CONFIG_X86_IO_APIC */
406
407/*
408 * acpi_pic_sci_set_trigger()
409 *
410 * use ELCR to set PIC-mode trigger type for SCI
411 *
412 * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
413 * it may require Edge Trigger -- use "acpi_sci=edge"
414 *
415 * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
416 * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
417 * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
418 * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
419 */
420
421void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
422{
423 unsigned int mask = 1 << irq;
424 unsigned int old, new;
425
426 /* Real old ELCR mask */
427 old = inb(0x4d0) | (inb(0x4d1) << 8);
428
429 /*
430 * If we use ACPI to set PCI irq's, then we should clear ELCR
431 * since we will set it correctly as we enable the PCI irq
432 * routing.
433 */
434 new = acpi_noirq ? old : 0;
435
436 /*
437 * Update SCI information in the ELCR, it isn't in the PCI
438 * routing tables..
439 */
440 switch (trigger) {
441 case 1: /* Edge - clear */
442 new &= ~mask;
443 break;
444 case 3: /* Level - set */
445 new |= mask;
446 break;
447 }
448
449 if (old == new)
450 return;
451
452 printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
453 outb(new, 0x4d0);
454 outb(new >> 8, 0x4d1);
455}
456
457int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
458{
459 *irq = gsi;
460 return 0;
461}
462
463/*
464 * success: return IRQ number (>=0)
465 * failure: return < 0
466 */
467int acpi_register_gsi(u32 gsi, int triggering, int polarity)
468{
469 unsigned int irq;
470 unsigned int plat_gsi = gsi;
471
472#ifdef CONFIG_PCI
473 /*
474 * Make sure all (legacy) PCI IRQs are set as level-triggered.
475 */
476 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
477 extern void eisa_set_level_irq(unsigned int irq);
478
479 if (triggering == ACPI_LEVEL_SENSITIVE)
480 eisa_set_level_irq(gsi);
481 }
482#endif
483
484#ifdef CONFIG_X86_IO_APIC
485 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
486 plat_gsi = mp_register_gsi(gsi, triggering, polarity);
487 }
488#endif
489 acpi_gsi_to_irq(plat_gsi, &irq);
490 return irq;
491}
492
493EXPORT_SYMBOL(acpi_register_gsi);
494
495/*
496 * ACPI based hotplug support for CPU
497 */
498#ifdef CONFIG_ACPI_HOTPLUG_CPU
499int acpi_map_lsapic(acpi_handle handle, int *pcpu)
500{
501 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
502 union acpi_object *obj;
503 struct acpi_madt_local_apic *lapic;
504 cpumask_t tmp_map, new_map;
505 u8 physid;
506 int cpu;
507
508 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
509 return -EINVAL;
510
511 if (!buffer.length || !buffer.pointer)
512 return -EINVAL;
513
514 obj = buffer.pointer;
515 if (obj->type != ACPI_TYPE_BUFFER ||
516 obj->buffer.length < sizeof(*lapic)) {
517 kfree(buffer.pointer);
518 return -EINVAL;
519 }
520
521 lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
522
523 if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
524 !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
525 kfree(buffer.pointer);
526 return -EINVAL;
527 }
528
529 physid = lapic->id;
530
531 kfree(buffer.pointer);
532 buffer.length = ACPI_ALLOCATE_BUFFER;
533 buffer.pointer = NULL;
534
535 tmp_map = cpu_present_map;
536 mp_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
537
538 /*
539 * If mp_register_lapic successfully generates a new logical cpu
540 * number, then the following will get us exactly what was mapped
541 */
542 cpus_andnot(new_map, cpu_present_map, tmp_map);
543 if (cpus_empty(new_map)) {
544 printk ("Unable to map lapic to logical cpu number\n");
545 return -EINVAL;
546 }
547
548 cpu = first_cpu(new_map);
549
550 *pcpu = cpu;
551 return 0;
552}
553
554EXPORT_SYMBOL(acpi_map_lsapic);
555
556int acpi_unmap_lsapic(int cpu)
557{
558 x86_cpu_to_apicid[cpu] = -1;
559 cpu_clear(cpu, cpu_present_map);
560 num_processors--;
561
562 return (0);
563}
564
565EXPORT_SYMBOL(acpi_unmap_lsapic);
566#endif /* CONFIG_ACPI_HOTPLUG_CPU */
567
568int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
569{
570 /* TBD */
571 return -EINVAL;
572}
573
574EXPORT_SYMBOL(acpi_register_ioapic);
575
576int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
577{
578 /* TBD */
579 return -EINVAL;
580}
581
582EXPORT_SYMBOL(acpi_unregister_ioapic);
583
584static unsigned long __init
585acpi_scan_rsdp(unsigned long start, unsigned long length)
586{
587 unsigned long offset = 0;
588 unsigned long sig_len = sizeof("RSD PTR ") - 1;
589
590 /*
591 * Scan all 16-byte boundaries of the physical memory region for the
592 * RSDP signature.
593 */
594 for (offset = 0; offset < length; offset += 16) {
595 if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len))
596 continue;
597 return (start + offset);
598 }
599
600 return 0;
601}
602
603static int __init acpi_parse_sbf(struct acpi_table_header *table)
604{
605 struct acpi_table_boot *sb;
606
607 sb = (struct acpi_table_boot *)table;
608 if (!sb) {
609 printk(KERN_WARNING PREFIX "Unable to map SBF\n");
610 return -ENODEV;
611 }
612
613 sbf_port = sb->cmos_index; /* Save CMOS port */
614
615 return 0;
616}
617
618#ifdef CONFIG_HPET_TIMER
619#include <asm/hpet.h>
620
621static struct __initdata resource *hpet_res;
622
623static int __init acpi_parse_hpet(struct acpi_table_header *table)
624{
625 struct acpi_table_hpet *hpet_tbl;
626
627 hpet_tbl = (struct acpi_table_hpet *)table;
628 if (!hpet_tbl) {
629 printk(KERN_WARNING PREFIX "Unable to map HPET\n");
630 return -ENODEV;
631 }
632
633 if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
634 printk(KERN_WARNING PREFIX "HPET timers must be located in "
635 "memory.\n");
636 return -1;
637 }
638
639 hpet_address = hpet_tbl->address.address;
640 printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
641 hpet_tbl->id, hpet_address);
642
643 /*
644 * Allocate and initialize the HPET firmware resource for adding into
645 * the resource tree during the lateinit timeframe.
646 */
647#define HPET_RESOURCE_NAME_SIZE 9
648 hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
649
650 if (!hpet_res)
651 return 0;
652
653 memset(hpet_res, 0, sizeof(*hpet_res));
654 hpet_res->name = (void *)&hpet_res[1];
655 hpet_res->flags = IORESOURCE_MEM;
656 snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
657 hpet_tbl->sequence);
658
659 hpet_res->start = hpet_address;
660 hpet_res->end = hpet_address + (1 * 1024) - 1;
661
662 return 0;
663}
664
665/*
666 * hpet_insert_resource inserts the HPET resources used into the resource
667 * tree.
668 */
669static __init int hpet_insert_resource(void)
670{
671 if (!hpet_res)
672 return 1;
673
674 return insert_resource(&iomem_resource, hpet_res);
675}
676
677late_initcall(hpet_insert_resource);
678
679#else
680#define acpi_parse_hpet NULL
681#endif
682
683static int __init acpi_parse_fadt(struct acpi_table_header *table)
684{
685
686#ifdef CONFIG_X86_PM_TIMER
687 /* detect the location of the ACPI PM Timer */
688 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
689 /* FADT rev. 2 */
690 if (acpi_gbl_FADT.xpm_timer_block.space_id !=
691 ACPI_ADR_SPACE_SYSTEM_IO)
692 return 0;
693
694 pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
695 /*
696 * "X" fields are optional extensions to the original V1.0
697 * fields, so we must selectively expand V1.0 fields if the
698 * corresponding X field is zero.
699 */
700 if (!pmtmr_ioport)
701 pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
702 } else {
703 /* FADT rev. 1 */
704 pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
705 }
706 if (pmtmr_ioport)
707 printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
708 pmtmr_ioport);
709#endif
710 return 0;
711}
712
713unsigned long __init acpi_find_rsdp(void)
714{
715 unsigned long rsdp_phys = 0;
716
717 if (efi_enabled) {
718 if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
719 return efi.acpi20;
720 else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
721 return efi.acpi;
722 }
723 /*
724 * Scan memory looking for the RSDP signature. First search EBDA (low
725 * memory) paragraphs and then search upper memory (E0000-FFFFF).
726 */
727 rsdp_phys = acpi_scan_rsdp(0, 0x400);
728 if (!rsdp_phys)
729 rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
730
731 return rsdp_phys;
732}
733
734#ifdef CONFIG_X86_LOCAL_APIC
735/*
736 * Parse LAPIC entries in MADT
737 * returns 0 on success, < 0 on error
738 */
739static int __init acpi_parse_madt_lapic_entries(void)
740{
741 int count;
742
743 if (!cpu_has_apic)
744 return -ENODEV;
745
746 /*
747 * Note that the LAPIC address is obtained from the MADT (32-bit value)
748 * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
749 */
750
751 count =
752 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
753 acpi_parse_lapic_addr_ovr, 0);
754 if (count < 0) {
755 printk(KERN_ERR PREFIX
756 "Error parsing LAPIC address override entry\n");
757 return count;
758 }
759
760 mp_register_lapic_address(acpi_lapic_addr);
761
762 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic,
763 MAX_APICS);
764 if (!count) {
765 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
766 /* TBD: Cleanup to allow fallback to MPS */
767 return -ENODEV;
768 } else if (count < 0) {
769 printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
770 /* TBD: Cleanup to allow fallback to MPS */
771 return count;
772 }
773
774 count =
775 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
776 if (count < 0) {
777 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
778 /* TBD: Cleanup to allow fallback to MPS */
779 return count;
780 }
781 return 0;
782}
783#endif /* CONFIG_X86_LOCAL_APIC */
784
785#ifdef CONFIG_X86_IO_APIC
786/*
787 * Parse IOAPIC related entries in MADT
788 * returns 0 on success, < 0 on error
789 */
790static int __init acpi_parse_madt_ioapic_entries(void)
791{
792 int count;
793
794 /*
795 * ACPI interpreter is required to complete interrupt setup,
796 * so if it is off, don't enumerate the io-apics with ACPI.
797 * If MPS is present, it will handle them,
798 * otherwise the system will stay in PIC mode
799 */
800 if (acpi_disabled || acpi_noirq) {
801 return -ENODEV;
802 }
803
804 if (!cpu_has_apic)
805 return -ENODEV;
806
807 /*
808 * if "noapic" boot option, don't look for IO-APICs
809 */
810 if (skip_ioapic_setup) {
811 printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
812 "due to 'noapic' option.\n");
813 return -ENODEV;
814 }
815
816 count =
817 acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
818 MAX_IO_APICS);
819 if (!count) {
820 printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
821 return -ENODEV;
822 } else if (count < 0) {
823 printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
824 return count;
825 }
826
827 count =
828 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
829 NR_IRQ_VECTORS);
830 if (count < 0) {
831 printk(KERN_ERR PREFIX
832 "Error parsing interrupt source overrides entry\n");
833 /* TBD: Cleanup to allow fallback to MPS */
834 return count;
835 }
836
837 /*
838 * If the BIOS did not supply an INT_SRC_OVR for the SCI,
839 * pretend we got one so we can set the SCI flags.
840 */
841 if (!acpi_sci_override_gsi)
842 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
843
844 /* Fill in identity legacy mappings where there is no override */
845 mp_config_acpi_legacy_irqs();
846
847 count =
848 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
849 NR_IRQ_VECTORS);
850 if (count < 0) {
851 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
852 /* TBD: Cleanup to allow fallback to MPS */
853 return count;
854 }
855
856 return 0;
857}
858#else
859static inline int acpi_parse_madt_ioapic_entries(void)
860{
861 return -1;
862}
863#endif /* !CONFIG_X86_IO_APIC */
864
865static void __init acpi_process_madt(void)
866{
867#ifdef CONFIG_X86_LOCAL_APIC
868 int error;
869
870 if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
871
872 /*
873 * Parse MADT LAPIC entries
874 */
875 error = acpi_parse_madt_lapic_entries();
876 if (!error) {
877 acpi_lapic = 1;
878
879#ifdef CONFIG_X86_GENERICARCH
880 generic_bigsmp_probe();
881#endif
882 /*
883 * Parse MADT IO-APIC entries
884 */
885 error = acpi_parse_madt_ioapic_entries();
886 if (!error) {
887 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
888 acpi_irq_balance_set(NULL);
889 acpi_ioapic = 1;
890
891 smp_found_config = 1;
892 setup_apic_routing();
893 }
894 }
895 if (error == -EINVAL) {
896 /*
897 * Dell Precision Workstation 410, 610 come here.
898 */
899 printk(KERN_ERR PREFIX
900 "Invalid BIOS MADT, disabling ACPI\n");
901 disable_acpi();
902 }
903 }
904#endif
905 return;
906}
907
908#ifdef __i386__
909
910static int __init disable_acpi_irq(const struct dmi_system_id *d)
911{
912 if (!acpi_force) {
913 printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
914 d->ident);
915 acpi_noirq_set();
916 }
917 return 0;
918}
919
920static int __init disable_acpi_pci(const struct dmi_system_id *d)
921{
922 if (!acpi_force) {
923 printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
924 d->ident);
925 acpi_disable_pci();
926 }
927 return 0;
928}
929
930static int __init dmi_disable_acpi(const struct dmi_system_id *d)
931{
932 if (!acpi_force) {
933 printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
934 disable_acpi();
935 } else {
936 printk(KERN_NOTICE
937 "Warning: DMI blacklist says broken, but acpi forced\n");
938 }
939 return 0;
940}
941
942/*
943 * Limit ACPI to CPU enumeration for HT
944 */
945static int __init force_acpi_ht(const struct dmi_system_id *d)
946{
947 if (!acpi_force) {
948 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
949 d->ident);
950 disable_acpi();
951 acpi_ht = 1;
952 } else {
953 printk(KERN_NOTICE
954 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
955 }
956 return 0;
957}
958
959/*
960 * If your system is blacklisted here, but you find that acpi=force
961 * works for you, please contact acpi-devel@sourceforge.net
962 */
963static struct dmi_system_id __initdata acpi_dmi_table[] = {
964 /*
965 * Boxes that need ACPI disabled
966 */
967 {
968 .callback = dmi_disable_acpi,
969 .ident = "IBM Thinkpad",
970 .matches = {
971 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
972 DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
973 },
974 },
975
976 /*
977 * Boxes that need acpi=ht
978 */
979 {
980 .callback = force_acpi_ht,
981 .ident = "FSC Primergy T850",
982 .matches = {
983 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
984 DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
985 },
986 },
987 {
988 .callback = force_acpi_ht,
989 .ident = "HP VISUALIZE NT Workstation",
990 .matches = {
991 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
992 DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
993 },
994 },
995 {
996 .callback = force_acpi_ht,
997 .ident = "Compaq Workstation W8000",
998 .matches = {
999 DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1000 DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1001 },
1002 },
1003 {
1004 .callback = force_acpi_ht,
1005 .ident = "ASUS P4B266",
1006 .matches = {
1007 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1008 DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1009 },
1010 },
1011 {
1012 .callback = force_acpi_ht,
1013 .ident = "ASUS P2B-DS",
1014 .matches = {
1015 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1016 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1017 },
1018 },
1019 {
1020 .callback = force_acpi_ht,
1021 .ident = "ASUS CUR-DLS",
1022 .matches = {
1023 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1024 DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1025 },
1026 },
1027 {
1028 .callback = force_acpi_ht,
1029 .ident = "ABIT i440BX-W83977",
1030 .matches = {
1031 DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1032 DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1033 },
1034 },
1035 {
1036 .callback = force_acpi_ht,
1037 .ident = "IBM Bladecenter",
1038 .matches = {
1039 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1040 DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1041 },
1042 },
1043 {
1044 .callback = force_acpi_ht,
1045 .ident = "IBM eServer xSeries 360",
1046 .matches = {
1047 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1048 DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1049 },
1050 },
1051 {
1052 .callback = force_acpi_ht,
1053 .ident = "IBM eserver xSeries 330",
1054 .matches = {
1055 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1056 DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1057 },
1058 },
1059 {
1060 .callback = force_acpi_ht,
1061 .ident = "IBM eserver xSeries 440",
1062 .matches = {
1063 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1064 DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1065 },
1066 },
1067
1068 /*
1069 * Boxes that need ACPI PCI IRQ routing disabled
1070 */
1071 {
1072 .callback = disable_acpi_irq,
1073 .ident = "ASUS A7V",
1074 .matches = {
1075 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1076 DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1077 /* newer BIOS, Revision 1011, does work */
1078 DMI_MATCH(DMI_BIOS_VERSION,
1079 "ASUS A7V ACPI BIOS Revision 1007"),
1080 },
1081 },
1082 {
1083 /*
1084 * Latest BIOS for IBM 600E (1.16) has bad pcinum
1085 * for LPC bridge, which is needed for the PCI
1086 * interrupt links to work. DSDT fix is in bug 5966.
1087 * 2645, 2646 model numbers are shared with 600/600E/600X
1088 */
1089 .callback = disable_acpi_irq,
1090 .ident = "IBM Thinkpad 600 Series 2645",
1091 .matches = {
1092 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1093 DMI_MATCH(DMI_BOARD_NAME, "2645"),
1094 },
1095 },
1096 {
1097 .callback = disable_acpi_irq,
1098 .ident = "IBM Thinkpad 600 Series 2646",
1099 .matches = {
1100 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1101 DMI_MATCH(DMI_BOARD_NAME, "2646"),
1102 },
1103 },
1104 /*
1105 * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1106 */
1107 { /* _BBN 0 bug */
1108 .callback = disable_acpi_pci,
1109 .ident = "ASUS PR-DLS",
1110 .matches = {
1111 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1112 DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1113 DMI_MATCH(DMI_BIOS_VERSION,
1114 "ASUS PR-DLS ACPI BIOS Revision 1010"),
1115 DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1116 },
1117 },
1118 {
1119 .callback = disable_acpi_pci,
1120 .ident = "Acer TravelMate 36x Laptop",
1121 .matches = {
1122 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1123 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1124 },
1125 },
1126 {}
1127};
1128
1129#endif /* __i386__ */
1130
1131/*
1132 * acpi_boot_table_init() and acpi_boot_init()
1133 * called from setup_arch(), always.
1134 * 1. checksums all tables
1135 * 2. enumerates lapics
1136 * 3. enumerates io-apics
1137 *
1138 * acpi_table_init() is separate to allow reading SRAT without
1139 * other side effects.
1140 *
1141 * side effects of acpi_boot_init:
1142 * acpi_lapic = 1 if LAPIC found
1143 * acpi_ioapic = 1 if IOAPIC found
1144 * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1145 * if acpi_blacklisted() acpi_disabled = 1;
1146 * acpi_irq_model=...
1147 * ...
1148 *
1149 * return value: (currently ignored)
1150 * 0: success
1151 * !0: failure
1152 */
1153
1154int __init acpi_boot_table_init(void)
1155{
1156 int error;
1157
1158#ifdef __i386__
1159 dmi_check_system(acpi_dmi_table);
1160#endif
1161
1162 /*
1163 * If acpi_disabled, bail out
1164 * One exception: acpi=ht continues far enough to enumerate LAPICs
1165 */
1166 if (acpi_disabled && !acpi_ht)
1167 return 1;
1168
1169 /*
1170 * Initialize the ACPI boot-time table parser.
1171 */
1172 error = acpi_table_init();
1173 if (error) {
1174 disable_acpi();
1175 return error;
1176 }
1177
1178 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
1179
1180 /*
1181 * blacklist may disable ACPI entirely
1182 */
1183 error = acpi_blacklisted();
1184 if (error) {
1185 if (acpi_force) {
1186 printk(KERN_WARNING PREFIX "acpi=force override\n");
1187 } else {
1188 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1189 disable_acpi();
1190 return error;
1191 }
1192 }
1193
1194 return 0;
1195}
1196
1197int __init acpi_boot_init(void)
1198{
1199 /*
1200 * If acpi_disabled, bail out
1201 * One exception: acpi=ht continues far enough to enumerate LAPICs
1202 */
1203 if (acpi_disabled && !acpi_ht)
1204 return 1;
1205
1206 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
1207
1208 /*
1209 * set sci_int and PM timer address
1210 */
1211 acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
1212
1213 /*
1214 * Process the Multiple APIC Description Table (MADT), if present
1215 */
1216 acpi_process_madt();
1217
1218 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1219
1220 return 0;
1221}
1222
1223static int __init parse_acpi(char *arg)
1224{
1225 if (!arg)
1226 return -EINVAL;
1227
1228 /* "acpi=off" disables both ACPI table parsing and interpreter */
1229 if (strcmp(arg, "off") == 0) {
1230 disable_acpi();
1231 }
1232 /* acpi=force to over-ride black-list */
1233 else if (strcmp(arg, "force") == 0) {
1234 acpi_force = 1;
1235 acpi_ht = 1;
1236 acpi_disabled = 0;
1237 }
1238 /* acpi=strict disables out-of-spec workarounds */
1239 else if (strcmp(arg, "strict") == 0) {
1240 acpi_strict = 1;
1241 }
1242 /* Limit ACPI just to boot-time to enable HT */
1243 else if (strcmp(arg, "ht") == 0) {
1244 if (!acpi_force)
1245 disable_acpi();
1246 acpi_ht = 1;
1247 }
1248 /* "acpi=noirq" disables ACPI interrupt routing */
1249 else if (strcmp(arg, "noirq") == 0) {
1250 acpi_noirq_set();
1251 } else {
1252 /* Core will printk when we return error. */
1253 return -EINVAL;
1254 }
1255 return 0;
1256}
1257early_param("acpi", parse_acpi);
1258
1259/* FIXME: Using pci= for an ACPI parameter is a travesty. */
1260static int __init parse_pci(char *arg)
1261{
1262 if (arg && strcmp(arg, "noacpi") == 0)
1263 acpi_disable_pci();
1264 return 0;
1265}
1266early_param("pci", parse_pci);
1267
1268#ifdef CONFIG_X86_IO_APIC
1269static int __init parse_acpi_skip_timer_override(char *arg)
1270{
1271 acpi_skip_timer_override = 1;
1272 return 0;
1273}
1274early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
1275
1276static int __init parse_acpi_use_timer_override(char *arg)
1277{
1278 acpi_use_timer_override = 1;
1279 return 0;
1280}
1281early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
1282#endif /* CONFIG_X86_IO_APIC */
1283
1284static int __init setup_acpi_sci(char *s)
1285{
1286 if (!s)
1287 return -EINVAL;
1288 if (!strcmp(s, "edge"))
1289 acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE |
1290 (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
1291 else if (!strcmp(s, "level"))
1292 acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL |
1293 (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
1294 else if (!strcmp(s, "high"))
1295 acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH |
1296 (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
1297 else if (!strcmp(s, "low"))
1298 acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW |
1299 (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
1300 else
1301 return -EINVAL;
1302 return 0;
1303}
1304early_param("acpi_sci", setup_acpi_sci);
1305
1306int __acpi_acquire_global_lock(unsigned int *lock)
1307{
1308 unsigned int old, new, val;
1309 do {
1310 old = *lock;
1311 new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
1312 val = cmpxchg(lock, old, new);
1313 } while (unlikely (val != old));
1314 return (new < 3) ? -1 : 0;
1315}
1316
1317int __acpi_release_global_lock(unsigned int *lock)
1318{
1319 unsigned int old, new, val;
1320 do {
1321 old = *lock;
1322 new = old & ~0x3;
1323 val = cmpxchg(lock, old, new);
1324 } while (unlikely (val != old));
1325 return old & 0x1;
1326}
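
The two helpers that close out boot.c implement the FACS global-lock handshake shared with the firmware: bit 0 of the lock dword is the pending flag, bit 1 the owned flag. Acquiring sets the owned bit, or, if the lock is already owned, sets pending instead (the return value tells the caller whether it actually got the lock or must wait for the release interrupt); releasing clears both bits and reports whether pending was set, so the caller knows to signal the firmware via GBL_RLS. A small user-space sketch of the same bit protocol, using a GCC builtin in place of the kernel's cmpxchg() and returning 1/0 for acquired/contended instead of the kernel's -1/0 convention:

/*
 * Standalone sketch of the FACS global-lock bit protocol used by
 * __acpi_acquire_global_lock()/__acpi_release_global_lock() above.
 * Bit 0 of the lock dword is "pending", bit 1 is "owned".
 */
#include <stdio.h>

static int acquire_global_lock(unsigned int *lock)
{
	unsigned int old, new;

	do {
		old = *lock;
		/* Set the owned bit; if already owned, set pending instead. */
		new = ((old & ~0x3u) + 2) + ((old >> 1) & 0x1);
	} while (__sync_val_compare_and_swap(lock, old, new) != old);

	/* Low bits == 2: we own it; == 3: owner exists, we only set pending. */
	return (new & 0x3) == 2;
}

static int release_global_lock(unsigned int *lock)
{
	unsigned int old, new;

	do {
		old = *lock;
		new = old & ~0x3u;	/* clear owned and pending */
	} while (__sync_val_compare_and_swap(lock, old, new) != old);

	/* Tell the caller whether the firmware must be signalled (GBL_RLS). */
	return old & 0x1;
}

int main(void)
{
	unsigned int lock = 0;

	printf("acquired: %d\n", acquire_global_lock(&lock));   /* 1 */
	printf("acquired: %d\n", acquire_global_lock(&lock));   /* 0, pending set */
	printf("pending was set: %d\n", release_global_lock(&lock));
	return 0;
}
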
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
new file mode 100644
index 000000000000..2d39f55d29a8
--- /dev/null
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -0,0 +1,164 @@
1/*
2 * arch/i386/kernel/acpi/cstate.c
3 *
4 * Copyright (C) 2005 Intel Corporation
5 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
6 * - Added _PDC for SMP C-states on Intel CPUs
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/acpi.h>
13#include <linux/cpu.h>
14#include <linux/sched.h>
15
16#include <acpi/processor.h>
17#include <asm/acpi.h>
18
19/*
20 * Initialize bm_flags based on the CPU cache properties
21 * On SMP it depends on cache configuration
22 * - When cache is not shared among all CPUs, we flush cache
23 * before entering C3.
24 * - When cache is shared among all CPUs, we use bm_check
25 * mechanism as in UP case
26 *
27 * This routine is called only after all the CPUs are online
28 */
29void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
30 unsigned int cpu)
31{
32 struct cpuinfo_x86 *c = cpu_data + cpu;
33
34 flags->bm_check = 0;
35 if (num_online_cpus() == 1)
36 flags->bm_check = 1;
37 else if (c->x86_vendor == X86_VENDOR_INTEL) {
38 /*
39 * Today all CPUs that support C3 share cache.
40 * TBD: This needs to look at cache shared map, once
41 * multi-core detection patch makes to the base.
42 */
43 flags->bm_check = 1;
44 }
45}
46EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
47
48/* The code below handles C-state entry with the MONITOR/MWAIT pair on Intel CPUs */
49
50struct cstate_entry {
51 struct {
52 unsigned int eax;
53 unsigned int ecx;
54 } states[ACPI_PROCESSOR_MAX_POWER];
55};
56static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
57
58static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
59
60#define MWAIT_SUBSTATE_MASK (0xf)
61#define MWAIT_SUBSTATE_SIZE (4)
62
63#define CPUID_MWAIT_LEAF (5)
64#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
65#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
66
67#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
68
69#define NATIVE_CSTATE_BEYOND_HALT (2)
70
71int acpi_processor_ffh_cstate_probe(unsigned int cpu,
72 struct acpi_processor_cx *cx, struct acpi_power_register *reg)
73{
74 struct cstate_entry *percpu_entry;
75 struct cpuinfo_x86 *c = cpu_data + cpu;
76
77 cpumask_t saved_mask;
78 int retval;
79 unsigned int eax, ebx, ecx, edx;
80 unsigned int edx_part;
81 unsigned int cstate_type; /* C-state type and not ACPI C-state type */
82 unsigned int num_cstate_subtype;
83
84 if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF )
85 return -1;
86
87 if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
88 return -1;
89
90 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
91 percpu_entry->states[cx->index].eax = 0;
92 percpu_entry->states[cx->index].ecx = 0;
93
94 /* Make sure we are running on right CPU */
95 saved_mask = current->cpus_allowed;
96 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
97 if (retval)
98 return -1;
99
100 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
101
102 /* Check whether this particular cx_type (in CST) is supported or not */
103 cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1;
104 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
105 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
106
107 retval = 0;
108 if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
109 retval = -1;
110 goto out;
111 }
112
113 /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
114 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
115 !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) {
116 retval = -1;
117 goto out;
118 }
119 percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
120
121 /* Use the hint in CST */
122 percpu_entry->states[cx->index].eax = cx->address;
123
124 if (!mwait_supported[cstate_type]) {
125 mwait_supported[cstate_type] = 1;
126 printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d "
127 "state\n", cx->type);
128 }
129
130out:
131 set_cpus_allowed(current, saved_mask);
132 return retval;
133}
134EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
135
136void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
137{
138 unsigned int cpu = smp_processor_id();
139 struct cstate_entry *percpu_entry;
140
141 percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
142 mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
143 percpu_entry->states[cx->index].ecx);
144}
145EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
146
147static int __init ffh_cstate_init(void)
148{
149 struct cpuinfo_x86 *c = &boot_cpu_data;
150 if (c->x86_vendor != X86_VENDOR_INTEL)
151 return -1;
152
153 cpu_cstate_entry = alloc_percpu(struct cstate_entry);
154 return 0;
155}
156
157static void __exit ffh_cstate_exit(void)
158{
159 free_percpu(cpu_cstate_entry);
160 cpu_cstate_entry = NULL;
161}
162
163arch_initcall(ffh_cstate_init);
164__exitcall(ffh_cstate_exit);
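
acpi_processor_ffh_cstate_probe() above validates a _CST FFH (MWAIT) hint against CPUID leaf 5: ECX advertises the MWAIT extensions (bit 0 = enumeration supported, bit 1 = break-on-interrupt), and EDX packs one 4-bit count of supported sub-C-states per MWAIT C-state, starting with C0 in bits 3:0. The hint itself carries the target C-state in its upper nibble and the sub-state in its lower nibble. A user-space sketch of the same decoding, with an illustrative hint value (assumes GCC/clang and <cpuid.h> on x86):

/*
 * Decode CPUID leaf 5 (MONITOR/MWAIT) the way
 * acpi_processor_ffh_cstate_probe() does for one FFH hint.
 */
#include <stdio.h>
#include <cpuid.h>

#define MWAIT_SUBSTATE_MASK	0xf
#define MWAIT_SUBSTATE_SIZE	4

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int hint = 0x10;	/* illustrative hint: target C2, sub-state 0 */
	unsigned int cstate_type, substates;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
		return 1;		/* CPUID leaf 5 not available */

	/* Upper nibble of the hint selects the MWAIT C-state (plus one). */
	cstate_type = (hint >> MWAIT_SUBSTATE_SIZE) + 1;
	/* EDX holds a 4-bit sub-state count per C-state, C0 in bits 3:0. */
	substates = (edx >> (cstate_type * MWAIT_SUBSTATE_SIZE))
		    & MWAIT_SUBSTATE_MASK;

	printf("break-on-interrupt extension: %s\n", (ecx & 0x2) ? "yes" : "no");
	printf("sub-states for this C-state:  %u (hint asks for sub-state %u)\n",
	       substates, hint & MWAIT_SUBSTATE_MASK);
	return 0;
}
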
diff --git a/arch/x86/kernel/acpi/earlyquirk_32.c b/arch/x86/kernel/acpi/earlyquirk_32.c
new file mode 100644
index 000000000000..23f78efc577d
--- /dev/null
+++ b/arch/x86/kernel/acpi/earlyquirk_32.c
@@ -0,0 +1,84 @@
1/*
2 * Do early PCI probing for bug detection when the main PCI subsystem is
3 * not up yet.
4 */
5#include <linux/init.h>
6#include <linux/kernel.h>
7#include <linux/pci.h>
8#include <linux/acpi.h>
9
10#include <asm/pci-direct.h>
11#include <asm/acpi.h>
12#include <asm/apic.h>
13
14#ifdef CONFIG_ACPI
15
16static int __init nvidia_hpet_check(struct acpi_table_header *header)
17{
18 return 0;
19}
20#endif
21
22static int __init check_bridge(int vendor, int device)
23{
24#ifdef CONFIG_ACPI
25 static int warned;
26 /* According to Nvidia all timer overrides are bogus unless HPET
27 is enabled. */
28 if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
29 if (!warned && acpi_table_parse(ACPI_SIG_HPET,
30 nvidia_hpet_check)) {
31 warned = 1;
32 acpi_skip_timer_override = 1;
33 printk(KERN_INFO "Nvidia board "
34 "detected. Ignoring ACPI "
35 "timer override.\n");
36 printk(KERN_INFO "If you got timer trouble "
37 "try acpi_use_timer_override\n");
38
39 }
40 }
41#endif
42 if (vendor == PCI_VENDOR_ID_ATI && timer_over_8254 == 1) {
43 timer_over_8254 = 0;
44 printk(KERN_INFO "ATI board detected. Disabling timer routing "
45 "over 8254.\n");
46 }
47 return 0;
48}
49
50void __init check_acpi_pci(void)
51{
52 int num, slot, func;
53
54 /* Assume the machine supports type 1. If not it will
55 always read ffffffff and should not have any side effect.
56 Actually a few buggy systems can machine check. Allow the user
57 to disable it by command line option at least -AK */
58 if (!early_pci_allowed())
59 return;
60
61 /* Poor man's PCI discovery */
62 for (num = 0; num < 32; num++) {
63 for (slot = 0; slot < 32; slot++) {
64 for (func = 0; func < 8; func++) {
65 u32 class;
66 u32 vendor;
67 class = read_pci_config(num, slot, func,
68 PCI_CLASS_REVISION);
69 if (class == 0xffffffff)
70 break;
71
72 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
73 continue;
74
75 vendor = read_pci_config(num, slot, func,
76 PCI_VENDOR_ID);
77
78 if (check_bridge(vendor & 0xffff, vendor >> 16))
79 return;
80 }
81
82 }
83 }
84}
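
check_acpi_pci() above walks bus 0..31 and every device/function by hand, using the type 1 configuration mechanism behind read_pci_config() from <asm/pci-direct.h>: an address word written to port 0xCF8 selects bus, device, function and register, and the data comes back from port 0xCFC. A hedged user-space sketch of that mechanism (needs root and iopl(3); on a real system prefer the kernel's or libpci's helpers):

/*
 * Sketch of a type 1 PCI configuration read, the mechanism behind
 * read_pci_config() used in the scan loop above.
 */
#include <stdio.h>
#include <sys/io.h>

static unsigned int pci_conf1_read(unsigned int bus, unsigned int dev,
				   unsigned int func, unsigned int off)
{
	unsigned int addr = 0x80000000u | (bus << 16) | (dev << 11) |
			    (func << 8) | (off & 0xfc);

	outl(addr, 0xcf8);	/* enable bit + bus/dev/func/dword offset */
	return inl(0xcfc);	/* data window */
}

int main(void)
{
	if (iopl(3))
		return 1;	/* need I/O privilege */
	/* Vendor/device ID of bus 0, device 0, function 0. */
	printf("%08x\n", pci_conf1_read(0, 0, 0, 0));
	return 0;
}
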
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
new file mode 100644
index 000000000000..b54fded49834
--- /dev/null
+++ b/arch/x86/kernel/acpi/processor.c
@@ -0,0 +1,75 @@
1/*
2 * arch/i386/kernel/acpi/processor.c
3 *
4 * Copyright (C) 2005 Intel Corporation
5 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
6 * - Added _PDC for platforms with Intel CPUs
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/acpi.h>
13
14#include <acpi/processor.h>
15#include <asm/acpi.h>
16
17static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
18{
19 struct acpi_object_list *obj_list;
20 union acpi_object *obj;
21 u32 *buf;
22
23 /* allocate and initialize pdc. It will be used later. */
24 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
25 if (!obj_list) {
26 printk(KERN_ERR "Memory allocation error\n");
27 return;
28 }
29
30 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
31 if (!obj) {
32 printk(KERN_ERR "Memory allocation error\n");
33 kfree(obj_list);
34 return;
35 }
36
37 buf = kmalloc(12, GFP_KERNEL);
38 if (!buf) {
39 printk(KERN_ERR "Memory allocation error\n");
40 kfree(obj);
41 kfree(obj_list);
42 return;
43 }
44
45 buf[0] = ACPI_PDC_REVISION_ID;
46 buf[1] = 1;
47 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
48
49 if (cpu_has(c, X86_FEATURE_EST))
50 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
51
52 obj->type = ACPI_TYPE_BUFFER;
53 obj->buffer.length = 12;
54 obj->buffer.pointer = (u8 *) buf;
55 obj_list->count = 1;
56 obj_list->pointer = obj;
57 pr->pdc = obj_list;
58
59 return;
60}
61
62/* Initialize _PDC data based on the CPU vendor */
63void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
64{
65 unsigned int cpu = pr->id;
66 struct cpuinfo_x86 *c = cpu_data + cpu;
67
68 pr->pdc = NULL;
69 if (c->x86_vendor == X86_VENDOR_INTEL)
70 init_intel_pdc(pr, c);
71
72 return;
73}
74
75EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
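
init_intel_pdc() above hands the ACPI processor driver a pre-built _PDC argument: a single ACPI buffer of three 32-bit words holding the _PDC revision, the number of capability dwords that follow, and one capability dword (SMP C-state capability, plus EST coordination when the CPU has Enhanced SpeedStep). A tiny sketch of that 12-byte layout; the field values here are illustrative:

/* The 12-byte _PDC buffer: revision, dword count, capability bits. */
#include <stdio.h>
#include <stdint.h>

struct pdc_buffer {
	uint32_t revision;	/* _PDC revision id */
	uint32_t count;		/* number of capability dwords that follow */
	uint32_t caps;		/* capability bits, e.g. SMP C-states, EST */
};

int main(void)
{
	struct pdc_buffer pdc = { .revision = 1, .count = 1, .caps = 0 };

	printf("_PDC buffer length: %zu bytes\n", sizeof(pdc));	/* 12 */
	return 0;
}
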
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
new file mode 100644
index 000000000000..10699489cfe7
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep_32.c
@@ -0,0 +1,110 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <linux/dmi.h>
11#include <linux/cpumask.h>
12
13#include <asm/smp.h>
14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address)
31 return 1;
32 memcpy((void *)acpi_wakeup_address, &wakeup_start,
33 &wakeup_end - &wakeup_start);
34 acpi_copy_wakeup_routine(acpi_wakeup_address);
35
36 return 0;
37}
38
39/*
40 * acpi_restore_state - undo effects of acpi_save_state_mem
41 */
42void acpi_restore_state_mem(void)
43{
44}
45
46/**
47 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
48 *
49 * We allocate a page from the first 1MB of memory for the wakeup
50 * routine for when we come back from a sleep state. The
51 * runtime allocator allows specification of <16MB pages, but not
52 * <1MB pages.
53 */
54void __init acpi_reserve_bootmem(void)
55{
56 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
57 printk(KERN_ERR
58 "ACPI: Wakeup code way too big, S3 disabled.\n");
59 return;
60 }
61
62 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
63 if (!acpi_wakeup_address)
64 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
65}
66
67static int __init acpi_sleep_setup(char *str)
68{
69 while ((str != NULL) && (*str != '\0')) {
70 if (strncmp(str, "s3_bios", 7) == 0)
71 acpi_realmode_flags |= 1;
72 if (strncmp(str, "s3_mode", 7) == 0)
73 acpi_realmode_flags |= 2;
74 if (strncmp(str, "s3_beep", 7) == 0)
75 acpi_realmode_flags |= 4;
76 str = strchr(str, ',');
77 if (str != NULL)
78 str += strspn(str, ", \t");
79 }
80 return 1;
81}
82
83__setup("acpi_sleep=", acpi_sleep_setup);
84
85/* Ouch, we want to delete this. We already have a better version in userspace,
86 in s2ram from the suspend.sf.net project */
87static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
88{
89 acpi_realmode_flags |= 2;
90 return 0;
91}
92
93static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
94 { /* Reset video mode after returning from ACPI S3 sleep */
95 .callback = reset_videomode_after_s3,
96 .ident = "Toshiba Satellite 4030cdt",
97 .matches = {
98 DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
99 },
100 },
101 {}
102};
103
104static int __init acpisleep_dmi_init(void)
105{
106 dmi_check_system(acpisleep_dmi_table);
107 return 0;
108}
109
110core_initcall(acpisleep_dmi_init);
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
new file mode 100644
index 000000000000..79475d237071
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep_64.c
@@ -0,0 +1,120 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/types.h>
32#include <linux/stddef.h>
33#include <linux/slab.h>
34#include <linux/pci.h>
35#include <linux/bootmem.h>
36#include <linux/acpi.h>
37#include <linux/cpumask.h>
38
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50/* --------------------------------------------------------------------------
51 Low-Level Sleep Support
52 -------------------------------------------------------------------------- */
53
54/* address in low memory of the wakeup routine. */
55unsigned long acpi_wakeup_address = 0;
56unsigned long acpi_realmode_flags;
57extern char wakeup_start, wakeup_end;
58
59extern unsigned long acpi_copy_wakeup_routine(unsigned long);
60
61/**
62 * acpi_save_state_mem - save kernel state
63 *
64 * Create an identity mapped page table and copy the wakeup routine to
65 * low memory.
66 */
67int acpi_save_state_mem(void)
68{
69 memcpy((void *)acpi_wakeup_address, &wakeup_start,
70 &wakeup_end - &wakeup_start);
71 acpi_copy_wakeup_routine(acpi_wakeup_address);
72
73 return 0;
74}
75
76/*
77 * acpi_restore_state
78 */
79void acpi_restore_state_mem(void)
80{
81}
82
83/**
84 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
85 *
86 * We allocate a page in low memory for the wakeup
87 * routine for when we come back from a sleep state. The
88 * runtime allocator allows specification of <16M pages, but not
89 * <1M pages.
90 */
91void __init acpi_reserve_bootmem(void)
92{
93 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
94 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
95 printk(KERN_CRIT
96 "ACPI: Wakeup code way too big, will crash on attempt"
97 " to suspend\n");
98}
99
100static int __init acpi_sleep_setup(char *str)
101{
102 while ((str != NULL) && (*str != '\0')) {
103 if (strncmp(str, "s3_bios", 7) == 0)
104 acpi_realmode_flags |= 1;
105 if (strncmp(str, "s3_mode", 7) == 0)
106 acpi_realmode_flags |= 2;
107 if (strncmp(str, "s3_beep", 7) == 0)
108 acpi_realmode_flags |= 4;
109 str = strchr(str, ',');
110 if (str != NULL)
111 str += strspn(str, ", \t");
112 }
113 return 1;
114}
115
116__setup("acpi_sleep=", acpi_sleep_setup);
117
118void acpi_pci_link_exit(void)
119{
120}
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
new file mode 100644
index 000000000000..f22ba8534d26
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -0,0 +1,321 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5
6#
7# wakeup_code runs in real mode, at an unknown address (determined at run-time).
8# Therefore it must only use relative jumps/calls.
9#
10# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled.
11#
12# If physical address of wakeup_code is 0x12345, BIOS should call us with
13# cs = 0x1234, eip = 0x05
14#
15
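#
# A note on the BEEP macro below (a sketch of what the raw port I/O does):
# port 0x61 (97) gates the PC speaker, port 0x43 (67) is the PIT command
# register and port 0x42 (66) is PIT channel 2.  0xb6 (-74) selects channel 2,
# lobyte/hibyte access, square-wave mode; the divisor bytes 0x89 (-119) and
# 0x0f give roughly a 300 Hz tone.  The writes to port 0x80 are the
# traditional short I/O delay.
#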
16#define BEEP \
17 inb $97, %al; \
18 outb %al, $0x80; \
19 movb $3, %al; \
20 outb %al, $97; \
21 outb %al, $0x80; \
22 movb $-74, %al; \
23 outb %al, $67; \
24 outb %al, $0x80; \
25 movb $-119, %al; \
26 outb %al, $66; \
27 outb %al, $0x80; \
28 movb $15, %al; \
29 outb %al, $66;
30
31ALIGN
32 .align 4096
33ENTRY(wakeup_start)
34wakeup_code:
35 wakeup_code_start = .
36 .code16
37
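	# Single characters are poked straight into the VGA text buffer
	# (segment 0xb800) as a crude progress trace that works before any
	# console exists; 0x0e00 is the yellow-on-black attribute byte.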
38 movw $0xb800, %ax
39 movw %ax,%fs
40 movw $0x0e00 + 'L', %fs:(0x10)
41
42 cli
43 cld
44
45 # setup data segment
46 movw %cs, %ax
47 movw %ax, %ds # Make ds:0 point to wakeup_start
48 movw %ax, %ss
49
50 testl $4, realmode_flags - wakeup_code
51 jz 1f
52 BEEP
531:
54 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
55 movw $0x0e00 + 'S', %fs:(0x12)
56
57 pushl $0 # Kill any dangerous flags
58 popfl
59
60 movl real_magic - wakeup_code, %eax
61 cmpl $0x12345678, %eax
62 jne bogus_real_magic
63
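	# acpi_sleep=s3_bios (bit 0): call the video BIOS init entry point at
	# c000:0003 to reinitialize the graphics card; the BIOS call may
	# clobber %ds/%ss, so they are reloaded below.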
64 testl $1, realmode_flags - wakeup_code
65 jz 1f
66 lcall $0xc000,$3
67 movw %cs, %ax
68 movw %ax, %ds # Bios might have played with that
69 movw %ax, %ss
701:
71
72 testl $2, realmode_flags - wakeup_code
73 jz 1f
74 mov video_mode - wakeup_code, %ax
75 call mode_set
761:
77
78 # set up page table
79 movl $swsusp_pg_dir-__PAGE_OFFSET, %eax
80 movl %eax, %cr3
81
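	# 0xc0000080 is the EFER MSR: if NX was in use, restore the saved EFER
	# value (it carries the NX enable bit) before paging is turned back on.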
82 testl $1, real_efer_save_restore - wakeup_code
83 jz 4f
84 # restore efer setting
85 movl real_save_efer_edx - wakeup_code, %edx
86 movl real_save_efer_eax - wakeup_code, %eax
87 mov $0xc0000080, %ecx
88 wrmsr
894:
90 # make sure %cr4 is set correctly (features, etc)
91 movl real_save_cr4 - wakeup_code, %eax
92 movl %eax, %cr4
93 movw $0xb800, %ax
94 movw %ax,%fs
95 movw $0x0e00 + 'i', %fs:(0x12)
96
97 # need a gdt -- use lgdtl to force 32-bit operands, in case
98 # the GDT is located past 16 megabytes.
99 lgdtl real_save_gdt - wakeup_code
100
101 movl real_save_cr0 - wakeup_code, %eax
102 movl %eax, %cr0
103 jmp 1f
1041:
105 movw $0x0e00 + 'n', %fs:(0x14)
106
107 movl real_magic - wakeup_code, %eax
108 cmpl $0x12345678, %eax
109 jne bogus_real_magic
110
111 testl $8, realmode_flags - wakeup_code
112 jz 1f
113 BEEP
1141:
115 ljmpl $__KERNEL_CS, $wakeup_pmode_return
116
117real_save_gdt: .word 0
118 .long 0
119real_save_cr0: .long 0
120real_save_cr3: .long 0
121real_save_cr4: .long 0
122real_magic: .long 0
123video_mode: .long 0
124realmode_flags: .long 0
125beep_flags: .long 0
126real_efer_save_restore: .long 0
127real_save_efer_edx: .long 0
128real_save_efer_eax: .long 0
129
130bogus_real_magic:
131 movw $0x0e00 + 'B', %fs:(0x12)
132 jmp bogus_real_magic
133
134/* This code uses an extended set of video mode numbers. These include:
135 * Aliases for standard modes
136 * NORMAL_VGA (-1)
137 * EXTENDED_VGA (-2)
138 * ASK_VGA (-3)
139 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
140 * of compatibility when extending the table. These are between 0x00 and 0xff.
141 */
142#define VIDEO_FIRST_MENU 0x0000
143
144/* Standard BIOS video modes (BIOS number + 0x0100) */
145#define VIDEO_FIRST_BIOS 0x0100
146
147/* VESA BIOS video modes (VESA number + 0x0200) */
148#define VIDEO_FIRST_VESA 0x0200
149
150/* Video7 special modes (BIOS number + 0x0900) */
151#define VIDEO_FIRST_V7 0x0900
152
153# Set the user-requested video mode (AX=mode ID) => CF=success
154
155# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
156# modes, we should probably compile in the video code from the boot
157# directory.
158mode_set:
159 movw %ax, %bx
160 subb $VIDEO_FIRST_VESA>>8, %bh
161 cmpb $2, %bh
162 jb check_vesa
163
164setbad:
165 clc
166 ret
167
168check_vesa:
169 orw $0x4000, %bx # Use linear frame buffer
170 movw $0x4f02, %ax # VESA BIOS mode set call
171 int $0x10
172 cmpw $0x004f, %ax # AL=4f if implemented
173 jnz setbad # AH=0 if OK
174
175 stc
176 ret
177
178 .code32
179 ALIGN
180
181.org 0x800
182wakeup_stack_begin: # Stack grows down
183
184.org 0xff0 # Just below end of page
185wakeup_stack:
186ENTRY(wakeup_end)
187
188.org 0x1000
189
190wakeup_pmode_return:
191 movw $__KERNEL_DS, %ax
192 movw %ax, %ss
193 movw %ax, %ds
194 movw %ax, %es
195 movw %ax, %fs
196 movw %ax, %gs
197 movw $0x0e00 + 'u', 0xb8016
198
199 # reload the gdt, as we need the full 32 bit address
200 lgdt saved_gdt
201 lidt saved_idt
202 lldt saved_ldt
203 ljmp $(__KERNEL_CS),$1f
2041:
205 movl %cr3, %eax
206 movl %eax, %cr3
207 wbinvd
208
209 # and restore the stack ... but you need gdt for this to work
210 movl saved_context_esp, %esp
211
212 movl %cs:saved_magic, %eax
213 cmpl $0x12345678, %eax
214 jne bogus_magic
215
216 # jump to place where we left off
217 movl saved_eip,%eax
218 jmp *%eax
219
220bogus_magic:
221 movw $0x0e00 + 'B', 0xb8018
222 jmp bogus_magic
223
224
225##
226# acpi_copy_wakeup_routine
227#
228# Copy the above routine to low memory.
229#
230# Parameters:
231# %eax: place to copy wakeup routine to
232#
233# Returned address is location of code in low memory (past data and stack)
234#
235ENTRY(acpi_copy_wakeup_routine)
236
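	# On entry %eax holds the low-memory address the wakeup image was
	# copied to (acpi_wakeup_address).  "symbol - wakeup_start(%eax)"
	# therefore addresses each variable inside that copy rather than in
	# the kernel's own image.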
237 pushl %ebx
238 sgdt saved_gdt
239 sidt saved_idt
240 sldt saved_ldt
241 str saved_tss
242
243 movl nx_enabled, %edx
244 movl %edx, real_efer_save_restore - wakeup_start (%eax)
245 testl $1, real_efer_save_restore - wakeup_start (%eax)
246 jz 2f
247 # save efer setting
248 pushl %eax
249 movl %eax, %ebx
250 mov $0xc0000080, %ecx
251 rdmsr
252 movl %edx, real_save_efer_edx - wakeup_start (%ebx)
253 movl %eax, real_save_efer_eax - wakeup_start (%ebx)
254 popl %eax
2552:
256
257 movl %cr3, %edx
258 movl %edx, real_save_cr3 - wakeup_start (%eax)
259 movl %cr4, %edx
260 movl %edx, real_save_cr4 - wakeup_start (%eax)
261 movl %cr0, %edx
262 movl %edx, real_save_cr0 - wakeup_start (%eax)
263 sgdt real_save_gdt - wakeup_start (%eax)
264
265 movl saved_videomode, %edx
266 movl %edx, video_mode - wakeup_start (%eax)
267 movl acpi_realmode_flags, %edx
268 movl %edx, realmode_flags - wakeup_start (%eax)
269 movl $0x12345678, real_magic - wakeup_start (%eax)
270 movl $0x12345678, saved_magic
271 popl %ebx
272 ret
273
274save_registers:
275 leal 4(%esp), %eax
276 movl %eax, saved_context_esp
277 movl %ebx, saved_context_ebx
278 movl %ebp, saved_context_ebp
279 movl %esi, saved_context_esi
280 movl %edi, saved_context_edi
281 pushfl ; popl saved_context_eflags
282
283 movl $ret_point, saved_eip
284 ret
285
286
287restore_registers:
288 movl saved_context_ebp, %ebp
289 movl saved_context_ebx, %ebx
290 movl saved_context_esi, %esi
291 movl saved_context_edi, %edi
292 pushl saved_context_eflags ; popfl
293 ret
294
295ENTRY(do_suspend_lowlevel)
296 call save_processor_state
297 call save_registers
298 pushl $3
299 call acpi_enter_sleep_state
300 addl $4, %esp
301
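# If acpi_enter_sleep_state(3) succeeds the machine powers down here; on
# wake-up the BIOS runs the copied wakeup code, which switches back to
# protected mode and jumps to saved_eip, i.e. to ret_point below.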
302# In case of S3 failure, we'll emerge here. Jump
303# to ret_point to recover
304 jmp ret_point
305 .p2align 4,,7
306ret_point:
307 call restore_registers
308 call restore_processor_state
309 ret
310
311.data
312ALIGN
313ENTRY(saved_magic) .long 0
314ENTRY(saved_eip) .long 0
315
316# saved registers
317saved_gdt: .long 0,0
318saved_idt: .long 0,0
319saved_ldt: .long 0
320saved_tss: .long 0
321
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
new file mode 100644
index 000000000000..8b4357e1efe0
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -0,0 +1,456 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/pgtable.h>
5#include <asm/page.h>
6#include <asm/msr.h>
7
8# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
9#
10# wakeup_code runs in real mode, and at an unknown address (determined at run time).
11# Therefore it must only use relative jumps/calls.
12#
13# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled.
14#
15# If the physical address of wakeup_code is 0x12345, the BIOS should call us with
16# cs = 0x1234, eip = 0x05
17#
18
19#define BEEP \
20 inb $97, %al; \
21 outb %al, $0x80; \
22 movb $3, %al; \
23 outb %al, $97; \
24 outb %al, $0x80; \
25 movb $-74, %al; \
26 outb %al, $67; \
27 outb %al, $0x80; \
28 movb $-119, %al; \
29 outb %al, $66; \
30 outb %al, $0x80; \
31 movb $15, %al; \
32 outb %al, $66;
33
34
35ALIGN
36 .align 16
37ENTRY(wakeup_start)
38wakeup_code:
39 wakeup_code_start = .
40 .code16
41
42# Running in a *copy* of this code, somewhere in the low 1MB.
43
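	# Byte writes to port 0x80 leave POST-style progress codes that a
	# port-0x80 debug card can display; they double as a short I/O delay.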
44 movb $0xa1, %al ; outb %al, $0x80
45 cli
46 cld
47 # setup data segment
48 movw %cs, %ax
49 movw %ax, %ds # Make ds:0 point to wakeup_start
50 movw %ax, %ss
51
52 # Data segment must be set up before we can see whether to beep.
53 testl $4, realmode_flags - wakeup_code
54 jz 1f
55 BEEP
561:
57
58 # Private stack is needed for ASUS board
59 mov $(wakeup_stack - wakeup_code), %sp
60
61 pushl $0 # Kill any dangerous flags
62 popfl
63
64 movl real_magic - wakeup_code, %eax
65 cmpl $0x12345678, %eax
66 jne bogus_real_magic
67
68 call verify_cpu # Verify the cpu supports long
69 # mode
70 testl %eax, %eax
71 jnz no_longmode
72
73 testl $1, realmode_flags - wakeup_code
74 jz 1f
75 lcall $0xc000,$3
76 movw %cs, %ax
77 movw %ax, %ds # Bios might have played with that
78 movw %ax, %ss
791:
80
81 testl $2, realmode_flags - wakeup_code
82 jz 1f
83 mov video_mode - wakeup_code, %ax
84 call mode_set
851:
86
87 movw $0xb800, %ax
88 movw %ax,%fs
89 movw $0x0e00 + 'L', %fs:(0x10)
90
91 movb $0xa2, %al ; outb %al, $0x80
92
93	mov %ds, %ax			# Find 32bit wakeup_code addr
94	movzx %ax, %esi			# (Convert the %ds segment to a linear ptr)
95 shll $4, %esi
96 # Fix up the vectors
97 addl %esi, wakeup_32_vector - wakeup_code
98 addl %esi, wakeup_long64_vector - wakeup_code
99 addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer
100
101 lidtl %ds:idt_48a - wakeup_code
102 lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
103 # appropriate
104
105 movl $1, %eax # protected mode (PE) bit
106 lmsw %ax # This is it!
107 jmp 1f
1081:
109
110 ljmpl *(wakeup_32_vector - wakeup_code)
111
112 .balign 4
113wakeup_32_vector:
114 .long wakeup_32 - wakeup_code
115 .word __KERNEL32_CS, 0
116
117 .code32
118wakeup_32:
119# Running in this code, but at low address; paging is not yet turned on.
120 movb $0xa5, %al ; outb %al, $0x80
121
122 movl $__KERNEL_DS, %eax
123 movl %eax, %ds
124
125 movw $0x0e00 + 'i', %ds:(0xb8012)
126 movb $0xa8, %al ; outb %al, $0x80;
127
128	/*
129	 * Prepare for entering 64-bit mode
130	 */
131
132 /* Enable PAE */
133 xorl %eax, %eax
134 btsl $5, %eax
135 movl %eax, %cr4
136
137 /* Setup early boot stage 4 level pagetables */
138 leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax
139 movl %eax, %cr3
140
141 /* Check if nx is implemented */
142 movl $0x80000001, %eax
143 cpuid
144 movl %edx,%edi
145
146 /* Enable Long Mode */
147 xorl %eax, %eax
148 btsl $_EFER_LME, %eax
149
150 /* No Execute supported? */
151 btl $20,%edi
152 jnc 1f
153 btsl $_EFER_NX, %eax
154
155 /* Make changes effective */
1561: movl $MSR_EFER, %ecx
157 xorl %edx, %edx
158 wrmsr
159
160 xorl %eax, %eax
161 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
162 btsl $0, %eax /* Enable protected mode */
163
164 /* Make changes effective */
165 movl %eax, %cr0
166
167 /* At this point:
168 CR4.PAE must be 1
169 CS.L must be 0
170 CR3 must point to PML4
171 Next instruction must be a branch
172	   This must be on an identity-mapped page
173 */
174	/*
175	 * At this point we're in long mode, but in 32-bit compatibility mode
176	 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
177	 * EFER.LMA = 1). Now we want to jump into 64-bit mode; to do that we
178	 * load the new gdt/idt that has __KERNEL_CS with CS.L = 1.
179	 */
180
181 /* Finally jump in 64bit mode */
182 ljmp *(wakeup_long64_vector - wakeup_code)(%esi)
183
184 .balign 4
185wakeup_long64_vector:
186 .long wakeup_long64 - wakeup_code
187 .word __KERNEL_CS, 0
188
189.code64
190
191 /* Hooray, we are in Long 64-bit mode (but still running in
192 * low memory)
193 */
194wakeup_long64:
195	/*
196	 * We must switch to a new descriptor in kernel space for the GDT,
197	 * because soon the kernel won't have access any more to the userspace
198	 * addresses where we're currently running. We have to do that here
199	 * because in 32-bit mode we couldn't load a 64-bit linear address.
200	 */
201 lgdt cpu_gdt_descr
202
203 movw $0x0e00 + 'n', %ds:(0xb8014)
204 movb $0xa9, %al ; outb %al, $0x80
205
206 movq saved_magic, %rax
207 movq $0x123456789abcdef0, %rdx
208 cmpq %rdx, %rax
209 jne bogus_64_magic
210
211 movw $0x0e00 + 'u', %ds:(0xb8016)
212
213 nop
214 nop
215 movw $__KERNEL_DS, %ax
216 movw %ax, %ss
217 movw %ax, %ds
218 movw %ax, %es
219 movw %ax, %fs
220 movw %ax, %gs
221 movq saved_rsp, %rsp
222
223 movw $0x0e00 + 'x', %ds:(0xb8018)
224 movq saved_rbx, %rbx
225 movq saved_rdi, %rdi
226 movq saved_rsi, %rsi
227 movq saved_rbp, %rbp
228
229 movw $0x0e00 + '!', %ds:(0xb801a)
230 movq saved_rip, %rax
231 jmp *%rax
232
233.code32
234
235 .align 64
236gdta:
237	/* It's good to keep this gdt in sync with the one in trampoline.S */
238	.word 0, 0, 0, 0			# dummy
239	/* ??? Why do I need the accessed bit set in order for this to work? */
240 .quad 0x00cf9b000000ffff # __KERNEL32_CS
241 .quad 0x00af9b000000ffff # __KERNEL_CS
242 .quad 0x00cf93000000ffff # __KERNEL_DS
243
244idt_48a:
245 .word 0 # idt limit = 0
246 .word 0, 0 # idt base = 0L
247
248gdt_48a:
249 .word 0x800 # gdt limit=2048,
250 # 256 GDT entries
251 .long gdta - wakeup_code # gdt base (relocated in later)
252
253real_magic: .quad 0
254video_mode: .quad 0
255realmode_flags: .quad 0
256
257.code16
258bogus_real_magic:
259 movb $0xba,%al ; outb %al,$0x80
260 jmp bogus_real_magic
261
262.code64
263bogus_64_magic:
264 movb $0xb3,%al ; outb %al,$0x80
265 jmp bogus_64_magic
266
267.code16
268no_longmode:
269 movb $0xbc,%al ; outb %al,$0x80
270 jmp no_longmode
271
272#include "../verify_cpu_64.S"
273
274/* This code uses an extended set of video mode numbers. These include:
275 * Aliases for standard modes
276 * NORMAL_VGA (-1)
277 * EXTENDED_VGA (-2)
278 * ASK_VGA (-3)
279 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
280 * of compatibility when extending the table. These are between 0x00 and 0xff.
281 */
282#define VIDEO_FIRST_MENU 0x0000
283
284/* Standard BIOS video modes (BIOS number + 0x0100) */
285#define VIDEO_FIRST_BIOS 0x0100
286
287/* VESA BIOS video modes (VESA number + 0x0200) */
288#define VIDEO_FIRST_VESA 0x0200
289
290/* Video7 special modes (BIOS number + 0x0900) */
291#define VIDEO_FIRST_V7 0x0900
292
293# Set the user-requested video mode (AX=mode ID) => CF=success
294
295# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
296# modes, we should probably compile in the video code from the boot
297# directory.
298.code16
299mode_set:
300 movw %ax, %bx
301 subb $VIDEO_FIRST_VESA>>8, %bh
302 cmpb $2, %bh
303 jb check_vesa
304
305setbad:
306 clc
307 ret
308
309check_vesa:
310 orw $0x4000, %bx # Use linear frame buffer
311 movw $0x4f02, %ax # VESA BIOS mode set call
312 int $0x10
313 cmpw $0x004f, %ax # AL=4f if implemented
314 jnz setbad # AH=0 if OK
315
316 stc
317 ret
318
319wakeup_stack_begin: # Stack grows down
320
321.org 0xff0
322wakeup_stack: # Just below end of page
323
324.org 0x1000
325ENTRY(wakeup_level4_pgt)
326 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
327 .fill 510,8,0
328 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
329 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
330
331ENTRY(wakeup_end)
332
333##
334# acpi_copy_wakeup_routine
335#
336# Copy the above routine to low memory.
337#
338# Parameters:
339# %rdi: place to copy wakeup routine to
340#
341# Returned address is location of code in low memory (past data and stack)
342#
343 .code64
344ENTRY(acpi_copy_wakeup_routine)
345 pushq %rax
346 pushq %rdx
347
348 movl saved_video_mode, %edx
349 movl %edx, video_mode - wakeup_start (,%rdi)
350 movl acpi_realmode_flags, %edx
351 movl %edx, realmode_flags - wakeup_start (,%rdi)
352 movq $0x12345678, real_magic - wakeup_start (,%rdi)
353 movq $0x123456789abcdef0, %rdx
354 movq %rdx, saved_magic
355
356 movq saved_magic, %rax
357 movq $0x123456789abcdef0, %rdx
358 cmpq %rdx, %rax
359 jne bogus_64_magic
360
361 # restore the regs we used
362 popq %rdx
363 popq %rax
364ENTRY(do_suspend_lowlevel_s4bios)
365 ret
366
367 .align 2
368 .p2align 4,,15
369.globl do_suspend_lowlevel
370 .type do_suspend_lowlevel,@function
371do_suspend_lowlevel:
372.LFB5:
373 subq $8, %rsp
374 xorl %eax, %eax
375 call save_processor_state
376
377 movq %rsp, saved_context_esp(%rip)
378 movq %rax, saved_context_eax(%rip)
379 movq %rbx, saved_context_ebx(%rip)
380 movq %rcx, saved_context_ecx(%rip)
381 movq %rdx, saved_context_edx(%rip)
382 movq %rbp, saved_context_ebp(%rip)
383 movq %rsi, saved_context_esi(%rip)
384 movq %rdi, saved_context_edi(%rip)
385 movq %r8, saved_context_r08(%rip)
386 movq %r9, saved_context_r09(%rip)
387 movq %r10, saved_context_r10(%rip)
388 movq %r11, saved_context_r11(%rip)
389 movq %r12, saved_context_r12(%rip)
390 movq %r13, saved_context_r13(%rip)
391 movq %r14, saved_context_r14(%rip)
392 movq %r15, saved_context_r15(%rip)
393 pushfq ; popq saved_context_eflags(%rip)
394
395 movq $.L97, saved_rip(%rip)
396
397 movq %rsp,saved_rsp
398 movq %rbp,saved_rbp
399 movq %rbx,saved_rbx
400 movq %rdi,saved_rdi
401 movq %rsi,saved_rsi
402
403 addq $8, %rsp
404 movl $3, %edi
405 xorl %eax, %eax
406 jmp acpi_enter_sleep_state
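	# acpi_enter_sleep_state(3) does not return if the sleep succeeds; on
	# wake-up the low-memory wakeup code restores long mode and jumps to
	# saved_rip, which was set above to point at .L97.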
407.L97:
408 .p2align 4,,7
409.L99:
410 .align 4
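	# $24 (0x18) is __KERNEL_DS; the constants added to saved_context below
	# are the byte offsets of its cr4/cr3/cr2/cr0 fields.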
411 movl $24, %eax
412 movw %ax, %ds
413 movq saved_context+58(%rip), %rax
414 movq %rax, %cr4
415 movq saved_context+50(%rip), %rax
416 movq %rax, %cr3
417 movq saved_context+42(%rip), %rax
418 movq %rax, %cr2
419 movq saved_context+34(%rip), %rax
420 movq %rax, %cr0
421 pushq saved_context_eflags(%rip) ; popfq
422 movq saved_context_esp(%rip), %rsp
423 movq saved_context_ebp(%rip), %rbp
424 movq saved_context_eax(%rip), %rax
425 movq saved_context_ebx(%rip), %rbx
426 movq saved_context_ecx(%rip), %rcx
427 movq saved_context_edx(%rip), %rdx
428 movq saved_context_esi(%rip), %rsi
429 movq saved_context_edi(%rip), %rdi
430 movq saved_context_r08(%rip), %r8
431 movq saved_context_r09(%rip), %r9
432 movq saved_context_r10(%rip), %r10
433 movq saved_context_r11(%rip), %r11
434 movq saved_context_r12(%rip), %r12
435 movq saved_context_r13(%rip), %r13
436 movq saved_context_r14(%rip), %r14
437 movq saved_context_r15(%rip), %r15
438
439 xorl %eax, %eax
440 addq $8, %rsp
441 jmp restore_processor_state
442.LFE5:
443.Lfe5:
444 .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
445
446.data
447ALIGN
448ENTRY(saved_rbp) .quad 0
449ENTRY(saved_rsi) .quad 0
450ENTRY(saved_rdi) .quad 0
451ENTRY(saved_rbx) .quad 0
452
453ENTRY(saved_rip) .quad 0
454ENTRY(saved_rsp) .quad 0
455
456ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 000000000000..bd72d94e713e
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,450 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/spinlock.h>
4#include <linux/list.h>
5#include <linux/kprobes.h>
6#include <linux/mm.h>
7#include <linux/vmalloc.h>
8#include <asm/alternative.h>
9#include <asm/sections.h>
10#include <asm/pgtable.h>
11#include <asm/mce.h>
12#include <asm/nmi.h>
13
14#define MAX_PATCH_LEN (255-1)
15
16#ifdef CONFIG_HOTPLUG_CPU
17static int smp_alt_once;
18
19static int __init bootonly(char *str)
20{
21 smp_alt_once = 1;
22 return 1;
23}
24__setup("smp-alt-boot", bootonly);
25#else
26#define smp_alt_once 1
27#endif
28
29static int debug_alternative;
30
31static int __init debug_alt(char *str)
32{
33 debug_alternative = 1;
34 return 1;
35}
36__setup("debug-alternative", debug_alt);
37
38static int noreplace_smp;
39
40static int __init setup_noreplace_smp(char *str)
41{
42 noreplace_smp = 1;
43 return 1;
44}
45__setup("noreplace-smp", setup_noreplace_smp);
46
47#ifdef CONFIG_PARAVIRT
48static int noreplace_paravirt = 0;
49
50static int __init setup_noreplace_paravirt(char *str)
51{
52 noreplace_paravirt = 1;
53 return 1;
54}
55__setup("noreplace-paravirt", setup_noreplace_paravirt);
56#endif
57
58#define DPRINTK(fmt, args...) if (debug_alternative) \
59 printk(KERN_DEBUG fmt, args)
60
61#ifdef GENERIC_NOP1
62/* Use inline assembly to define this because the nops are defined
63 as inline assembly strings in the include files and we cannot
64 get them easily into strings. */
65asm("\t.data\nintelnops: "
66 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
67 GENERIC_NOP7 GENERIC_NOP8);
68extern unsigned char intelnops[];
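/*
 * The asm blob above emits a 1-byte, 2-byte, ... 8-byte NOP back to back,
 * so intel_nops[n] points at an n-byte NOP sequence starting
 * 1 + 2 + ... + (n - 1) bytes into the blob.
 */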
69static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
70 NULL,
71 intelnops,
72 intelnops + 1,
73 intelnops + 1 + 2,
74 intelnops + 1 + 2 + 3,
75 intelnops + 1 + 2 + 3 + 4,
76 intelnops + 1 + 2 + 3 + 4 + 5,
77 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
78 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
79};
80#endif
81
82#ifdef K8_NOP1
83asm("\t.data\nk8nops: "
84 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
85 K8_NOP7 K8_NOP8);
86extern unsigned char k8nops[];
87static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
88 NULL,
89 k8nops,
90 k8nops + 1,
91 k8nops + 1 + 2,
92 k8nops + 1 + 2 + 3,
93 k8nops + 1 + 2 + 3 + 4,
94 k8nops + 1 + 2 + 3 + 4 + 5,
95 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
96 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
97};
98#endif
99
100#ifdef K7_NOP1
101asm("\t.data\nk7nops: "
102 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
103 K7_NOP7 K7_NOP8);
104extern unsigned char k7nops[];
105static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
106 NULL,
107 k7nops,
108 k7nops + 1,
109 k7nops + 1 + 2,
110 k7nops + 1 + 2 + 3,
111 k7nops + 1 + 2 + 3 + 4,
112 k7nops + 1 + 2 + 3 + 4 + 5,
113 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
114 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
115};
116#endif
117
118#ifdef CONFIG_X86_64
119
120extern char __vsyscall_0;
121static inline unsigned char** find_nop_table(void)
122{
123 return k8_nops;
124}
125
126#else /* CONFIG_X86_64 */
127
128static struct nop {
129 int cpuid;
130 unsigned char **noptable;
131} noptypes[] = {
132 { X86_FEATURE_K8, k8_nops },
133 { X86_FEATURE_K7, k7_nops },
134 { -1, NULL }
135};
136
137static unsigned char** find_nop_table(void)
138{
139 unsigned char **noptable = intel_nops;
140 int i;
141
142 for (i = 0; noptypes[i].cpuid >= 0; i++) {
143 if (boot_cpu_has(noptypes[i].cpuid)) {
144 noptable = noptypes[i].noptable;
145 break;
146 }
147 }
148 return noptable;
149}
150
151#endif /* CONFIG_X86_64 */
152
153/* Use this to add nops to a buffer, then text_poke the whole buffer. */
154static void add_nops(void *insns, unsigned int len)
155{
156 unsigned char **noptable = find_nop_table();
157
158 while (len > 0) {
159 unsigned int noplen = len;
160 if (noplen > ASM_NOP_MAX)
161 noplen = ASM_NOP_MAX;
162 memcpy(insns, noptable[noplen], noplen);
163 insns += noplen;
164 len -= noplen;
165 }
166}
167
168extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
169extern u8 *__smp_locks[], *__smp_locks_end[];
170
171/* Replace instructions with better alternatives for this CPU type.
172   This runs before SMP is initialized to avoid SMP problems with
173   self-modifying code. This implies that asymmetric systems where
174   APs have fewer capabilities than the boot processor are not handled.
175   Tough. Make sure you disable such features by hand. */
176
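/*
 * Each alt_instr record (generated elsewhere, typically by an alternative()
 * style macro) describes one patch site: ->instr is the original instruction
 * in .text, ->replacement the preferred sequence, ->cpuid the feature bit
 * that must be present, and ->replacementlen <= ->instrlen; the leftover
 * bytes are NOP-padded by the loop below.
 */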
177void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
178{
179 struct alt_instr *a;
180 char insnbuf[MAX_PATCH_LEN];
181
182 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
183 for (a = start; a < end; a++) {
184 u8 *instr = a->instr;
185 BUG_ON(a->replacementlen > a->instrlen);
186 BUG_ON(a->instrlen > sizeof(insnbuf));
187 if (!boot_cpu_has(a->cpuid))
188 continue;
189#ifdef CONFIG_X86_64
190 /* vsyscall code is not mapped yet. resolve it manually. */
191 if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
192 instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
193 DPRINTK("%s: vsyscall fixup: %p => %p\n",
194 __FUNCTION__, a->instr, instr);
195 }
196#endif
197 memcpy(insnbuf, a->replacement, a->replacementlen);
198 add_nops(insnbuf + a->replacementlen,
199 a->instrlen - a->replacementlen);
200 text_poke(instr, insnbuf, a->instrlen);
201 }
202}
203
204#ifdef CONFIG_SMP
205
206static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
207{
208 u8 **ptr;
209
210 for (ptr = start; ptr < end; ptr++) {
211 if (*ptr < text)
212 continue;
213 if (*ptr > text_end)
214 continue;
215 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
216 };
217}
218
219static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
220{
221 u8 **ptr;
222 char insn[1];
223
224 if (noreplace_smp)
225 return;
226
227 add_nops(insn, 1);
228 for (ptr = start; ptr < end; ptr++) {
229 if (*ptr < text)
230 continue;
231 if (*ptr > text_end)
232 continue;
233 text_poke(*ptr, insn, 1);
234 };
235}
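/*
 * The __smp_locks section records the address of every LOCK prefix byte
 * (0xf0) emitted for SMP-safe instructions.  When running with a single CPU
 * the prefixes are replaced by a one-byte NOP (unlock above); when a second
 * CPU is brought up they are put back (lock above).
 */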
236
237struct smp_alt_module {
238	/* module that owns these lock prefixes (NULL for the core kernel) */
239 struct module *mod;
240 char *name;
241
242 /* ptrs to lock prefixes */
243 u8 **locks;
244 u8 **locks_end;
245
246 /* .text segment, needed to avoid patching init code ;) */
247 u8 *text;
248 u8 *text_end;
249
250 struct list_head next;
251};
252static LIST_HEAD(smp_alt_modules);
253static DEFINE_SPINLOCK(smp_alt);
254
255void alternatives_smp_module_add(struct module *mod, char *name,
256 void *locks, void *locks_end,
257 void *text, void *text_end)
258{
259 struct smp_alt_module *smp;
260 unsigned long flags;
261
262 if (noreplace_smp)
263 return;
264
265 if (smp_alt_once) {
266 if (boot_cpu_has(X86_FEATURE_UP))
267 alternatives_smp_unlock(locks, locks_end,
268 text, text_end);
269 return;
270 }
271
272 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
273 if (NULL == smp)
274 return; /* we'll run the (safe but slow) SMP code then ... */
275
276 smp->mod = mod;
277 smp->name = name;
278 smp->locks = locks;
279 smp->locks_end = locks_end;
280 smp->text = text;
281 smp->text_end = text_end;
282 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
283 __FUNCTION__, smp->locks, smp->locks_end,
284 smp->text, smp->text_end, smp->name);
285
286 spin_lock_irqsave(&smp_alt, flags);
287 list_add_tail(&smp->next, &smp_alt_modules);
288 if (boot_cpu_has(X86_FEATURE_UP))
289 alternatives_smp_unlock(smp->locks, smp->locks_end,
290 smp->text, smp->text_end);
291 spin_unlock_irqrestore(&smp_alt, flags);
292}
293
294void alternatives_smp_module_del(struct module *mod)
295{
296 struct smp_alt_module *item;
297 unsigned long flags;
298
299 if (smp_alt_once || noreplace_smp)
300 return;
301
302 spin_lock_irqsave(&smp_alt, flags);
303 list_for_each_entry(item, &smp_alt_modules, next) {
304 if (mod != item->mod)
305 continue;
306 list_del(&item->next);
307 spin_unlock_irqrestore(&smp_alt, flags);
308 DPRINTK("%s: %s\n", __FUNCTION__, item->name);
309 kfree(item);
310 return;
311 }
312 spin_unlock_irqrestore(&smp_alt, flags);
313}
314
315void alternatives_smp_switch(int smp)
316{
317 struct smp_alt_module *mod;
318 unsigned long flags;
319
320#ifdef CONFIG_LOCKDEP
321 /*
322 * A not yet fixed binutils section handling bug prevents
323 * alternatives-replacement from working reliably, so turn
324 * it off:
325 */
326 printk("lockdep: not fixing up alternatives.\n");
327 return;
328#endif
329
330 if (noreplace_smp || smp_alt_once)
331 return;
332 BUG_ON(!smp && (num_online_cpus() > 1));
333
334 spin_lock_irqsave(&smp_alt, flags);
335 if (smp) {
336 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
337 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
338 clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
339 list_for_each_entry(mod, &smp_alt_modules, next)
340 alternatives_smp_lock(mod->locks, mod->locks_end,
341 mod->text, mod->text_end);
342 } else {
343 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
344 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
345 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
346 list_for_each_entry(mod, &smp_alt_modules, next)
347 alternatives_smp_unlock(mod->locks, mod->locks_end,
348 mod->text, mod->text_end);
349 }
350 spin_unlock_irqrestore(&smp_alt, flags);
351}
352
353#endif
354
355#ifdef CONFIG_PARAVIRT
356void apply_paravirt(struct paravirt_patch_site *start,
357 struct paravirt_patch_site *end)
358{
359 struct paravirt_patch_site *p;
360 char insnbuf[MAX_PATCH_LEN];
361
362 if (noreplace_paravirt)
363 return;
364
365 for (p = start; p < end; p++) {
366 unsigned int used;
367
368 BUG_ON(p->len > MAX_PATCH_LEN);
369 /* prep the buffer with the original instructions */
370 memcpy(insnbuf, p->instr, p->len);
371 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
372 (unsigned long)p->instr, p->len);
373
374 BUG_ON(used > p->len);
375
376 /* Pad the rest with nops */
377 add_nops(insnbuf + used, p->len - used);
378 text_poke(p->instr, insnbuf, p->len);
379 }
380}
381extern struct paravirt_patch_site __start_parainstructions[],
382 __stop_parainstructions[];
383#endif /* CONFIG_PARAVIRT */
384
385void __init alternative_instructions(void)
386{
387 unsigned long flags;
388
389	/* The patching is not fully atomic, so try to avoid local interrupts
390	   that might execute the code being patched.
391	   Other CPUs are not running. */
392 stop_nmi();
393#ifdef CONFIG_X86_MCE
394 stop_mce();
395#endif
396
397 local_irq_save(flags);
398 apply_alternatives(__alt_instructions, __alt_instructions_end);
399
400 /* switch to patch-once-at-boottime-only mode and free the
401 * tables in case we know the number of CPUs will never ever
402 * change */
403#ifdef CONFIG_HOTPLUG_CPU
404 if (num_possible_cpus() < 2)
405 smp_alt_once = 1;
406#endif
407
408#ifdef CONFIG_SMP
409 if (smp_alt_once) {
410 if (1 == num_possible_cpus()) {
411 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
412 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
413 set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
414 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
415 _text, _etext);
416 }
417 free_init_pages("SMP alternatives",
418 (unsigned long)__smp_locks,
419 (unsigned long)__smp_locks_end);
420 } else {
421 alternatives_smp_module_add(NULL, "core kernel",
422 __smp_locks, __smp_locks_end,
423 _text, _etext);
424 alternatives_smp_switch(0);
425 }
426#endif
427 apply_paravirt(__parainstructions, __parainstructions_end);
428 local_irq_restore(flags);
429
430 restart_nmi();
431#ifdef CONFIG_X86_MCE
432 restart_mce();
433#endif
434}
435
436/*
437 * Warning:
438 * When you use this code to patch more than one byte of an instruction
439 * you need to make sure that other CPUs cannot execute this code in parallel.
440 * Also, no thread must be preempted in the middle of these instructions.
441 * And on the local CPU you need to be protected against NMI or MCE handlers
442 * seeing an inconsistent instruction while you patch.
443 */
444void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
445{
446 memcpy(addr, opcode, len);
447 sync_core();
448 /* Could also do a CLFLUSH here to speed up CPU recovery; but
449 that causes hangs on some VIA CPUs. */
450}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 000000000000..8f681cae7bf7
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,298 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all else fails, map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/init.h>
15#include <linux/bootmem.h>
16#include <linux/mmzone.h>
17#include <linux/pci_ids.h>
18#include <linux/pci.h>
19#include <linux/bitops.h>
20#include <linux/ioport.h>
21#include <asm/e820.h>
22#include <asm/io.h>
23#include <asm/iommu.h>
24#include <asm/pci-direct.h>
25#include <asm/dma.h>
26#include <asm/k8.h>
27
28int iommu_aperture;
29int iommu_aperture_disabled __initdata = 0;
30int iommu_aperture_allowed __initdata = 0;
31
32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0;
34
35int fix_aperture __initdata = 1;
36
37static struct resource gart_resource = {
38 .name = "GART",
39 .flags = IORESOURCE_MEM,
40};
41
42static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
43{
44 gart_resource.start = aper_base;
45 gart_resource.end = aper_base + aper_size - 1;
46 insert_resource(&iomem_resource, &gart_resource);
47}
48
49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */
51
52static u32 __init allocate_aperture(void)
53{
54 u32 aper_size;
55 void *p;
56
57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60
61 /*
62	 * Aperture has to be naturally aligned. This means a 2GB aperture won't
63 * have much chance of finding a place in the lower 4GB of memory.
64 * Unfortunately we cannot move it up because that would make the
65 * IOMMU useless.
66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
70 p, aper_size>>10);
71 if (p)
72 free_bootmem(__pa(p), aper_size);
73 return 0;
74 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p);
79}
80
81static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{
83 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0;
88 }
89 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n");
91 return 0;
92 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0;
96 }
97 return 1;
98}
99
100/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap)
102{
103 u8 pos;
104 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
106 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id;
110 pos &= ~3;
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
112 if (id == 0xff)
113 break;
114 if (id == cap)
115 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
117 }
118 return 0;
119}
120
121/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{
124 u32 apsize;
125 u32 apsizereg;
126 int nbits;
127 u32 aper_low, aper_hi;
128 u64 aper;
129
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
132 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n");
134 return 0;
135 }
136
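	/*
	 * The decode below works by population count: every additional APSIZE
	 * bit that is set halves the aperture, so order = 7 - hweight(apsize),
	 * with 32MB (order 0) as the floor.
	 */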
137 apsize = apsizereg & 0xfff;
138	/* Some BIOSes use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff)
140 apsize |= 0xf00;
141 nbits = hweight16(apsize);
142 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */
144 *order = 0;
145
146 aper_low = read_pci_config(num,slot,func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg);
152
153 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0;
155 return (u32)aper;
156}
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159   AGP bridge and some BIOSes forget to initialize the Northbridge too.
160   Work around this here.
161
162   Do a PCI bus scan by hand because we're running before the PCI
163   subsystem.
164
165   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
166   generically. It's probably overkill to always scan all slots because
167   the AGP bridges should always be on their own bus in the HT hierarchy,
168   but do it here for future safety. */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{
171 int num, slot, func;
172
173 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) {
177 u32 class, cap;
178 u8 type;
179 class = read_pci_config(num,slot,func,
180 PCI_CLASS_REVISION);
181 if (class == 0xffffffff)
182 break;
183
184 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
189 if (!cap)
190 break;
191 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order);
193 }
194
195 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func,
197 PCI_HEADER_TYPE);
198 if (!(type & 0x80))
199 break;
200 }
201 }
202 }
203 printk("No AGP bridge found\n");
204 return 0;
205}
206
207void __init iommu_hole_init(void)
208{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0;
213
214 if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
215 return;
216
217 printk(KERN_INFO "Checking aperture...\n");
218
219 fix = 0;
220 for (num = 24; num < 32; num++) {
221 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
222 continue;
223
224 iommu_detected = 1;
225 iommu_aperture = 1;
226
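		/*
		 * Devices 00:18..00:1f, function 3 are the K8 northbridge
		 * miscellaneous control functions: offset 0x90 holds the GART
		 * aperture size order in bits 3:1, offset 0x94 the aperture
		 * base in 32MB units (hence the shift by 25).
		 */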
227 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
228 aper_size = (32 * 1024 * 1024) << aper_order;
229 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
230 aper_base <<= 25;
231
232 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
233 aper_base, aper_size>>20);
234
235 if (!aperture_valid(aper_base, aper_size)) {
236 fix = 1;
237 break;
238 }
239
240 if ((last_aper_order && aper_order != last_aper_order) ||
241 (last_aper_base && aper_base != last_aper_base)) {
242 fix = 1;
243 break;
244 }
245 last_aper_order = aper_order;
246 last_aper_base = aper_base;
247 }
248
249 if (!fix && !fallback_aper_force) {
250 if (last_aper_base) {
251 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
252 insert_aperture_resource((u32)last_aper_base, n);
253 }
254 return;
255 }
256
257 if (!fallback_aper_force)
258 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
259
260 if (aper_alloc) {
261 /* Got the aperture from the AGP bridge */
262 } else if (swiotlb && !valid_agp) {
263 /* Do nothing */
264 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
265 force_iommu ||
266 valid_agp ||
267 fallback_aper_force) {
268		printk("Your BIOS doesn't leave an aperture memory hole\n");
269 printk("Please enable the IOMMU option in the BIOS setup\n");
270 printk("This costs you %d MB of RAM\n",
271 32 << fallback_aper_order);
272
273 aper_order = fallback_aper_order;
274 aper_alloc = allocate_aperture();
275 if (!aper_alloc) {
276 /* Could disable AGP and IOMMU here, but it's probably
277 not worth it. But the later users cannot deal with
278 bad apertures and turning on the aperture over memory
279 causes very strange problems, so it's better to
280 panic early. */
281 panic("Not enough memory for aperture");
282 }
283 } else {
284 return;
285 }
286
287 /* Fix up the north bridges */
288 for (num = 24; num < 32; num++) {
289 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
290 continue;
291
292 /* Don't enable translation yet. That is done later.
293 Assume this BIOS didn't initialise the GART so
294 just overwrite all previous bits */
295 write_pci_config(0, num, 3, 0x90, aper_order<<1);
296 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
297 }
298}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
new file mode 100644
index 000000000000..3d67ae18d762
--- /dev/null
+++ b/arch/x86/kernel/apic_32.c
@@ -0,0 +1,1566 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/cpu.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30#include <linux/dmi.h>
31
32#include <asm/atomic.h>
33#include <asm/smp.h>
34#include <asm/mtrr.h>
35#include <asm/mpspec.h>
36#include <asm/desc.h>
37#include <asm/arch_hooks.h>
38#include <asm/hpet.h>
39#include <asm/i8253.h>
40#include <asm/nmi.h>
41
42#include <mach_apic.h>
43#include <mach_apicdef.h>
44#include <mach_ipi.h>
45
46#include "io_ports.h"
47
48/*
49 * Sanity check
50 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
52# error SPURIOUS_APIC_VECTOR definition error
53#endif
54
55/*
56 * Knob to control our willingness to enable the local APIC.
57 *
58 * -1=force-disable, +1=force-enable
59 */
60static int enable_local_apic __initdata = 0;
61
62/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok;
64/* Local APIC timer disabled from the kernel command line, by a DMI quirk,
65   or by a CPU MSR check */
66int local_apic_timer_disabled;
67/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
70
71/*
72 * Debug level, exported for io_apic.c
73 */
74int apic_verbosity;
75
76static unsigned int calibration_result;
77
78static int lapic_next_event(unsigned long delta,
79 struct clock_event_device *evt);
80static void lapic_timer_setup(enum clock_event_mode mode,
81 struct clock_event_device *evt);
82static void lapic_timer_broadcast(cpumask_t mask);
83static void apic_pm_activate(void);
84
85/*
86 * The local apic timer can be used for any function which is CPU local.
87 */
88static struct clock_event_device lapic_clockevent = {
89 .name = "lapic",
90 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
91 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
92 .shift = 32,
93 .set_mode = lapic_timer_setup,
94 .set_next_event = lapic_next_event,
95 .broadcast = lapic_timer_broadcast,
96 .rating = 100,
97 .irq = -1,
98};
99static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
100
101/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase;
103
104/*
105 * Get the LAPIC version
106 */
107static inline int lapic_get_version(void)
108{
109 return GET_APIC_VERSION(apic_read(APIC_LVR));
110}
111
112/*
113 * Check if the APIC is integrated or a separate chip
114 */
115static inline int lapic_is_integrated(void)
116{
117 return APIC_INTEGRATED(lapic_get_version());
118}
119
120/*
121 * Check whether this is a modern or a first-generation APIC
122 */
123static int modern_apic(void)
124{
125 /* AMD systems use old APIC versions, so check the CPU */
126 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
127 boot_cpu_data.x86 >= 0xf)
128 return 1;
129 return lapic_get_version() >= 0x14;
130}
131
132void apic_wait_icr_idle(void)
133{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
135 cpu_relax();
136}
137
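/*
 * Like apic_wait_icr_idle(), but bounded: poll for at most 1000 * 100us
 * (~100ms) and return the final busy status instead of spinning forever.
 */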
138unsigned long safe_apic_wait_icr_idle(void)
139{
140 unsigned long send_status;
141 int timeout;
142
143 timeout = 0;
144 do {
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150
151 return send_status;
152}
153
154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */
157void enable_NMI_through_LVT0 (void * dummy)
158{
159 unsigned int v = APIC_DM_NMI;
160
161 /* Level triggered for 82489DX */
162 if (!lapic_is_integrated())
163 v |= APIC_LVT_LEVEL_TRIGGER;
164 apic_write_around(APIC_LVT0, v);
165}
166
167/**
168 * get_physical_broadcast - Get number of physical broadcast IDs
169 */
170int get_physical_broadcast(void)
171{
172 return modern_apic() ? 0xff : 0xf;
173}
174
175/**
176 * lapic_get_maxlvt - get the maximum number of local vector table entries
177 */
178int lapic_get_maxlvt(void)
179{
180 unsigned int v = apic_read(APIC_LVR);
181
182 /* 82489DXs do not report # of LVT entries. */
183 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
184}
185
186/*
187 * Local APIC timer
188 */
189
190/* Clock divisor is set to 16 */
191#define APIC_DIVISOR 16
192
193/*
194 * This function sets up the local APIC timer, with a timeout of
195 * 'clocks' APIC bus clock. During calibration we actually call
196 * this function twice on the boot CPU, once with a bogus timeout
197 * value, second time for real. The other (noncalibrating) CPUs
198 * call this function only once, with the real, calibrated value.
199 *
200 * We do reads before writes even if unnecessary, to get around the
201 * P5 APIC double write bug.
202 */
203static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
204{
205 unsigned int lvtt_value, tmp_value;
206
207 lvtt_value = LOCAL_TIMER_VECTOR;
208 if (!oneshot)
209 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
210 if (!lapic_is_integrated())
211 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
212
213 if (!irqen)
214 lvtt_value |= APIC_LVT_MASKED;
215
216 apic_write_around(APIC_LVTT, lvtt_value);
217
218 /*
219 * Divide PICLK by 16
220 */
221 tmp_value = apic_read(APIC_TDCR);
222 apic_write_around(APIC_TDCR, (tmp_value
223 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
224 | APIC_TDR_DIV_16);
225
226 if (!oneshot)
227 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
228}
229
230/*
231 * Program the next event, relative to now
232 */
233static int lapic_next_event(unsigned long delta,
234 struct clock_event_device *evt)
235{
236 apic_write_around(APIC_TMICT, delta);
237 return 0;
238}
239
240/*
241 * Setup the lapic timer in periodic or oneshot mode
242 */
243static void lapic_timer_setup(enum clock_event_mode mode,
244 struct clock_event_device *evt)
245{
246 unsigned long flags;
247 unsigned int v;
248
249 /* Lapic used for broadcast ? */
250 if (!local_apic_timer_verify_ok)
251 return;
252
253 local_irq_save(flags);
254
255 switch (mode) {
256 case CLOCK_EVT_MODE_PERIODIC:
257 case CLOCK_EVT_MODE_ONESHOT:
258 __setup_APIC_LVTT(calibration_result,
259 mode != CLOCK_EVT_MODE_PERIODIC, 1);
260 break;
261 case CLOCK_EVT_MODE_UNUSED:
262 case CLOCK_EVT_MODE_SHUTDOWN:
263 v = apic_read(APIC_LVTT);
264 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
265 apic_write_around(APIC_LVTT, v);
266 break;
267 case CLOCK_EVT_MODE_RESUME:
268 /* Nothing to do here */
269 break;
270 }
271
272 local_irq_restore(flags);
273}
274
275/*
276 * Local APIC timer broadcast function
277 */
278static void lapic_timer_broadcast(cpumask_t mask)
279{
280#ifdef CONFIG_SMP
281 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
282#endif
283}
284
285/*
286 * Set up the local APIC timer for this CPU. Copy the initialized values
287 * from the boot CPU and register the clock event in the framework.
288 */
289static void __devinit setup_APIC_timer(void)
290{
291 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
292
293 memcpy(levt, &lapic_clockevent, sizeof(*levt));
294 levt->cpumask = cpumask_of_cpu(smp_processor_id());
295
296 clockevents_register_device(levt);
297}
298
299/*
300 * In this function we calibrate the APIC bus clock against the external timer.
301 *
302 * We want to do the calibration only once, since we want the local timer
303 * irqs to stay in sync. CPUs connected to the same APIC bus have the very
304 * same bus frequency.
305 *
306 * This was previously done by reading the PIT/HPET and waiting for a wrap
307 * around to find out that a tick has elapsed. I have a box where the PIT
308 * readout is broken, so it never gets out of the wait loop again. This was
309 * also reported by others.
310 *
311 * Monitoring the jiffies value is inaccurate and the clockevents
312 * infrastructure allows us to do a simple substitution of the interrupt
313 * handler.
314 *
315 * The calibration routine also uses the pm_timer when possible, as the PIT
316 * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
317 * back to normal later in the boot process).
318 */
319
320#define LAPIC_CAL_LOOPS (HZ/10)
321
322static __initdata int lapic_cal_loops = -1;
323static __initdata long lapic_cal_t1, lapic_cal_t2;
324static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
325static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
326static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
327
328/*
329 * Temporary interrupt handler.
330 */
331static void __init lapic_cal_handler(struct clock_event_device *dev)
332{
333 unsigned long long tsc = 0;
334 long tapic = apic_read(APIC_TMCCT);
335 unsigned long pm = acpi_pm_read_early();
336
337 if (cpu_has_tsc)
338 rdtscll(tsc);
339
340 switch (lapic_cal_loops++) {
341 case 0:
342 lapic_cal_t1 = tapic;
343 lapic_cal_tsc1 = tsc;
344 lapic_cal_pm1 = pm;
345 lapic_cal_j1 = jiffies;
346 break;
347
348 case LAPIC_CAL_LOOPS:
349 lapic_cal_t2 = tapic;
350 lapic_cal_tsc2 = tsc;
351 if (pm < lapic_cal_pm1)
352 pm += ACPI_PM_OVRRUN;
353 lapic_cal_pm2 = pm;
354 lapic_cal_j2 = jiffies;
355 break;
356 }
357}
358
359/*
360 * Setup the boot APIC
361 *
362 * Calibrate and verify the result.
363 */
364void __init setup_boot_APIC_clock(void)
365{
366 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
367 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
368 const long pm_thresh = pm_100ms/100;
369 void (*real_handler)(struct clock_event_device *dev);
370 unsigned long deltaj;
371 long delta, deltapm;
372 int pm_referenced = 0;
373
374 /*
375 * The local apic timer can be disabled via the kernel
376 * commandline or from the CPU detection code. Register the lapic
377 * timer as a dummy clock event source on SMP systems, so the
378 * broadcast mechanism is used. On UP systems simply ignore it.
379 */
380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1)
383 setup_APIC_timer();
384 return;
385 }
386
387 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
388 "calibrating APIC timer ...\n");
389
390 local_irq_disable();
391
392 /* Replace the global interrupt handler */
393 real_handler = global_clock_event->event_handler;
394 global_clock_event->event_handler = lapic_cal_handler;
395
396 /*
397 * Setup the APIC counter to 1e9. There is no way the lapic
398 * can underflow in the 100ms detection time frame
399 */
400 __setup_APIC_LVTT(1000000000, 0, 0);
401
402 /* Let the interrupts run */
403 local_irq_enable();
404
405 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
406 cpu_relax();
407
408 local_irq_disable();
409
410 /* Restore the real event handler */
411 global_clock_event->event_handler = real_handler;
412
413 /* Build delta t1-t2 as apic timer counts down */
414 delta = lapic_cal_t1 - lapic_cal_t2;
415 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
416
417 /* Check, if the PM timer is available */
418 deltapm = lapic_cal_pm2 - lapic_cal_pm1;
419 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
420
421 if (deltapm) {
422 unsigned long mult;
423 u64 res;
424
425 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
426
427 if (deltapm > (pm_100ms - pm_thresh) &&
428 deltapm < (pm_100ms + pm_thresh)) {
429 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
430 } else {
431 res = (((u64) deltapm) * mult) >> 22;
432 do_div(res, 1000000);
433 printk(KERN_WARNING "APIC calibration not consistent "
434 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res);
436 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms);
438 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta);
441 delta = (long) res;
442 }
443 pm_referenced = 1;
444 }
445
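	/*
	 * delta is the number of APIC timer ticks (behind the divide-by-16
	 * prescaler) counted over LAPIC_CAL_LOOPS (= HZ/10, i.e. 100ms worth
	 * of) global clock events; from it we derive the clockevent mult and,
	 * below, the bus clocks per jiffy in calibration_result.
	 */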
446 /* Calculate the scaled math multiplication factor */
447 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
448 lapic_clockevent.max_delta_ns =
449 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
450 lapic_clockevent.min_delta_ns =
451 clockevent_delta2ns(0xF, &lapic_clockevent);
452
453 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
454
455 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
456 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
457 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
458 calibration_result);
459
460 if (cpu_has_tsc) {
461 delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
462 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
463 "%ld.%04ld MHz.\n",
464 (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ),
465 (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ));
466 }
467
468 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
469 "%u.%04u MHz.\n",
470 calibration_result / (1000000 / HZ),
471 calibration_result % (1000000 / HZ));
472
473 local_apic_timer_verify_ok = 1;
474
475 /* We trust the pm timer based calibration */
476 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
478
479 /*
480 * Setup the apic timer manually
481 */
482 levt->event_handler = lapic_cal_handler;
483 lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
484 lapic_cal_loops = -1;
485
486 /* Let the interrupts run */
487 local_irq_enable();
488
489 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
490 cpu_relax();
491
492 local_irq_disable();
493
494 /* Stop the lapic timer */
495 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
496
497 local_irq_enable();
498
499 /* Jiffies delta */
500 deltaj = lapic_cal_j2 - lapic_cal_j1;
501 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
502
503 /* Check, if the jiffies result is consistent */
504 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
505 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
506 else
507 local_apic_timer_verify_ok = 0;
508 } else
509 local_irq_enable();
510
511 if (!local_apic_timer_verify_ok) {
512 printk(KERN_WARNING
513 "APIC timer disabled due to verification failure.\n");
514 /* No broadcast on UP ! */
515 if (num_possible_cpus() == 1)
516 return;
517 } else {
518 /*
519 * If nmi_watchdog is set to IO_APIC, we need the
520 * PIT/HPET going. Otherwise register lapic as a dummy
521 * device.
522 */
523 if (nmi_watchdog != NMI_IO_APIC)
524 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
525 else
526 printk(KERN_WARNING "APIC timer registered as dummy,"
527 " due to nmi_watchdog=1!\n");
528 }
529
530 /* Setup the lapic or request the broadcast */
531 setup_APIC_timer();
532}
533
534void __devinit setup_secondary_APIC_clock(void)
535{
536 setup_APIC_timer();
537}
538
539/*
540 * The guts of the apic timer interrupt
541 */
542static void local_apic_timer_interrupt(void)
543{
544 int cpu = smp_processor_id();
545 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
546
547 /*
548	 * Normally we should not be here until the LAPIC has been initialized, but
549	 * in some cases (such as kdump) it's possible that a LAPIC timer interrupt
550	 * pending from the previous kernel's context is delivered in the new
551	 * kernel the moment interrupts are enabled.
552	 *
553	 * Interrupts are enabled early and the LAPIC is set up much later, hence
554	 * it's possible that evt->event_handler is still NULL when we get here.
555	 * Check for event_handler being NULL and discard the interrupt as
556	 * spurious.
557 */
558 if (!evt->event_handler) {
559 printk(KERN_WARNING
560 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
561 /* Switch it off */
562 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
563 return;
564 }
565
566 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567
568 evt->event_handler(evt);
569}
570
571/*
572 * Local APIC timer interrupt. This is the most natural way for doing
573 * local interrupts, but local timer interrupts can be emulated by
574 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
575 *
576 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */
579
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{
582 struct pt_regs *old_regs = set_irq_regs(regs);
583
584 /*
585 * NOTE! We'd better ACK the irq immediately,
586 * because timer handling can be slow.
587 */
588 ack_APIC_irq();
589 /*
590 * update_process_times() expects us to have done irq_enter().
591	 * Besides, if we don't, timer interrupts ignore the global
592 * interrupt lock, which is the WrongThing (tm) to do.
593 */
594 irq_enter();
595 local_apic_timer_interrupt();
596 irq_exit();
597
598 set_irq_regs(old_regs);
599}
600
601int setup_profiling_timer(unsigned int multiplier)
602{
603 return -EINVAL;
604}
605
606/*
607 * Local APIC start and shutdown
608 */
609
610/**
611 * clear_local_APIC - shutdown the local APIC
612 *
613 * This is called when a CPU is disabled and before rebooting, so the state of
614 * the local APIC has no dangling leftovers. Also used to clean out any BIOS
615 * leftovers during boot.
616 */
617void clear_local_APIC(void)
618{
619 int maxlvt = lapic_get_maxlvt();
620 unsigned long v;
621
622 /*
623 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this.
625 */
626 if (maxlvt >= 3) {
627 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
628 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
629 }
630 /*
631 * Careful: we have to set masks only first to deassert
632 * any level-triggered sources.
633 */
634 v = apic_read(APIC_LVTT);
635 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
636 v = apic_read(APIC_LVT0);
637 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
638 v = apic_read(APIC_LVT1);
639 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
640 if (maxlvt >= 4) {
641 v = apic_read(APIC_LVTPC);
642 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
643 }
644
645 /* let's not touch this if we didn't frob it */
646#ifdef CONFIG_X86_MCE_P4THERMAL
647 if (maxlvt >= 5) {
648 v = apic_read(APIC_LVTTHMR);
649 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
650 }
651#endif
652 /*
653 * Clean APIC state for other OSs:
654 */
655 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
656 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
657 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
658 if (maxlvt >= 3)
659 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
660 if (maxlvt >= 4)
661 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
662
663#ifdef CONFIG_X86_MCE_P4THERMAL
664 if (maxlvt >= 5)
665 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
666#endif
667 /* Integrated APIC (!82489DX) ? */
668 if (lapic_is_integrated()) {
669 if (maxlvt > 3)
670 /* Clear ESR due to Pentium errata 3AP and 11AP */
671 apic_write(APIC_ESR, 0);
672 apic_read(APIC_ESR);
673 }
674}
675
676/**
677 * disable_local_APIC - clear and disable the local APIC
678 */
679void disable_local_APIC(void)
680{
681 unsigned long value;
682
683 clear_local_APIC();
684
685 /*
686 * Disable APIC (implies clearing of registers
687 * for 82489DX!).
688 */
689 value = apic_read(APIC_SPIV);
690 value &= ~APIC_SPIV_APIC_ENABLED;
691 apic_write_around(APIC_SPIV, value);
692
693 /*
694 * When LAPIC was disabled by the BIOS and enabled by the kernel,
695 * restore the disabled state.
696 */
697 if (enabled_via_apicbase) {
698 unsigned int l, h;
699
700 rdmsr(MSR_IA32_APICBASE, l, h);
701 l &= ~MSR_IA32_APICBASE_ENABLE;
702 wrmsr(MSR_IA32_APICBASE, l, h);
703 }
704}
705
706/*
707 * If Linux enabled the LAPIC against the BIOS default, disable it again before
708 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
709 * not power off. Additionally, clear all LVT entries before disable_local_APIC
710 * for the case where Linux didn't enable the LAPIC.
711 */
712void lapic_shutdown(void)
713{
714 unsigned long flags;
715
716 if (!cpu_has_apic)
717 return;
718
719 local_irq_save(flags);
720 clear_local_APIC();
721
722 if (enabled_via_apicbase)
723 disable_local_APIC();
724
725 local_irq_restore(flags);
726}
727
728/*
729 * This is to verify that we're looking at a real local APIC.
730 * Check these against your board if the CPUs aren't getting
731 * started for no apparent reason.
732 */
733int __init verify_local_APIC(void)
734{
735 unsigned int reg0, reg1;
736
737 /*
738 * The version register is read-only in a real APIC.
739 */
740 reg0 = apic_read(APIC_LVR);
741 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
742 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
743 reg1 = apic_read(APIC_LVR);
744 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
745
746 /*
747 * The two version reads above should print the same
748 * numbers. If the second one is different, then we
749 * poke at a non-APIC.
750 */
751 if (reg1 != reg0)
752 return 0;
753
754 /*
755 * Check if the version looks reasonable.
756 */
757 reg1 = GET_APIC_VERSION(reg0);
758 if (reg1 == 0x00 || reg1 == 0xff)
759 return 0;
760 reg1 = lapic_get_maxlvt();
761 if (reg1 < 0x02 || reg1 == 0xff)
762 return 0;
763
764 /*
765 * The ID register is read/write in a real APIC.
766 */
767 reg0 = apic_read(APIC_ID);
768 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
769
770 /*
771 * The next two are just to see if we have sane values.
772 * They're only really relevant if we're in Virtual Wire
773 * compatibility mode, but most boxes aren't anymore.
774 */
775 reg0 = apic_read(APIC_LVT0);
776 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
777 reg1 = apic_read(APIC_LVT1);
778 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
779
780 return 1;
781}
782
783/**
784 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
785 */
786void __init sync_Arb_IDs(void)
787{
788 /*
789 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
790 * needed on AMD.
791 */
792 if (modern_apic())
793 return;
794 /*
795 * Wait for idle.
796 */
797 apic_wait_icr_idle();
798
799 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
800 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
801 | APIC_DM_INIT);
802}
803
804/*
805 * An initial setup of the virtual wire mode.
806 */
807void __init init_bsp_APIC(void)
808{
809 unsigned long value;
810
811 /*
812 * Don't do the setup now if we have an SMP BIOS as the
813 * through-I/O-APIC virtual wire mode might be active.
814 */
815 if (smp_found_config || !cpu_has_apic)
816 return;
817
818 /*
819 * Do not trust the local APIC being empty at bootup.
820 */
821 clear_local_APIC();
822
823 /*
824 * Enable APIC.
825 */
826 value = apic_read(APIC_SPIV);
827 value &= ~APIC_VECTOR_MASK;
828 value |= APIC_SPIV_APIC_ENABLED;
829
830 /* This bit is reserved on P4/Xeon and should be cleared */
831 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
832 (boot_cpu_data.x86 == 15))
833 value &= ~APIC_SPIV_FOCUS_DISABLED;
834 else
835 value |= APIC_SPIV_FOCUS_DISABLED;
836 value |= SPURIOUS_APIC_VECTOR;
837 apic_write_around(APIC_SPIV, value);
838
839 /*
840 * Set up the virtual wire mode.
841 */
842 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
843 value = APIC_DM_NMI;
844 if (!lapic_is_integrated()) /* 82489DX */
845 value |= APIC_LVT_LEVEL_TRIGGER;
846 apic_write_around(APIC_LVT1, value);
847}
848
849/**
850 * setup_local_APIC - setup the local APIC
851 */
852void __devinit setup_local_APIC(void)
853{
854 unsigned long oldvalue, value, maxlvt, integrated;
855 int i, j;
856
857 /* Pound the ESR really hard over the head with a big hammer - mbligh */
858 if (esr_disable) {
859 apic_write(APIC_ESR, 0);
860 apic_write(APIC_ESR, 0);
861 apic_write(APIC_ESR, 0);
862 apic_write(APIC_ESR, 0);
863 }
864
865 integrated = lapic_is_integrated();
866
867 /*
868 * Double-check whether this APIC is really registered.
869 */
870 if (!apic_id_registered())
871 BUG();
872
873 /*
874 * Intel recommends to set DFR, LDR and TPR before enabling
875 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
876 * document number 292116). So here it goes...
877 */
878 init_apic_ldr();
879
880 /*
881 * Set Task Priority to 'accept all'. We never change this
882 * later on.
883 */
884 value = apic_read(APIC_TASKPRI);
885 value &= ~APIC_TPRI_MASK;
886 apic_write_around(APIC_TASKPRI, value);
887
888 /*
889 * After a crash, we no longer service the interrupts and a pending
890 * interrupt from previous kernel might still have ISR bit set.
891 *
892 * Most probably by now the CPU has serviced that pending interrupt and
893 * it might not have done the ack_APIC_irq() because it thought the
894 * interrupt came from the i8259 as ExtInt. The LAPIC did not get an EOI, so
895 * it does not clear the ISR bit and the CPU thinks it has already serviced
896 * the interrupt. Hence a vector might get locked. This was noticed for the
897 * timer irq (vector 0x31). Issue an extra EOI to clear the ISR.
898 */
899 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
900 value = apic_read(APIC_ISR + i*0x10);
901 for (j = 31; j >= 0; j--) {
902 if (value & (1<<j))
903 ack_APIC_irq();
904 }
905 }
906
907 /*
908 * Now that we are all set up, enable the APIC
909 */
910 value = apic_read(APIC_SPIV);
911 value &= ~APIC_VECTOR_MASK;
912 /*
913 * Enable APIC
914 */
915 value |= APIC_SPIV_APIC_ENABLED;
916
917 /*
918 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
919 * certain networking cards. If high frequency interrupts are
920 * happening on a particular IOAPIC pin, plus the IOAPIC routing
921 * entry is masked/unmasked at a high rate as well then sooner or
922 * later IOAPIC line gets 'stuck', no more interrupts are received
923 * from the device. If focus CPU is disabled then the hang goes
924 * away, oh well :-(
925 *
926 * [ This bug can be reproduced easily with a level-triggered
927 * PCI Ne2000 networking cards and PII/PIII processors, dual
928 * BX chipset. ]
929 */
930 /*
931 * Actually disabling the focus CPU check just makes the hang less
932 * frequent as it makes the interrupt distribution model more
933 * like LRU than MRU (the short-term load is more even across CPUs).
934 * See also the comment in end_level_ioapic_irq(). --macro
935 */
936
937 /* Enable focus processor (bit==0) */
938 value &= ~APIC_SPIV_FOCUS_DISABLED;
939
940 /*
941 * Set spurious IRQ vector
942 */
943 value |= SPURIOUS_APIC_VECTOR;
944 apic_write_around(APIC_SPIV, value);
945
946 /*
947 * Set up LVT0, LVT1:
948 *
949 * set up through-local-APIC on the BP's LINT0. This is not
950 * strictly necessary in pure symmetric-IO mode, but sometimes
951 * we delegate interrupts to the 8259A.
952 */
953 /*
954 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
955 */
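	/*
	 * Keep only the mask bit of LVT0: on the BP, if PIC mode is in use
	 * or the BIOS left LVT0 unmasked, route ExtINT through LINT0;
	 * otherwise program it masked.
	 */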
956 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
957 if (!smp_processor_id() && (pic_mode || !value)) {
958 value = APIC_DM_EXTINT;
959 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
960 smp_processor_id());
961 } else {
962 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
963 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
964 smp_processor_id());
965 }
966 apic_write_around(APIC_LVT0, value);
967
968 /*
969 * only the BP should see the LINT1 NMI signal, obviously.
970 */
971 if (!smp_processor_id())
972 value = APIC_DM_NMI;
973 else
974 value = APIC_DM_NMI | APIC_LVT_MASKED;
975 if (!integrated) /* 82489DX */
976 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value);
978
979 if (integrated && !esr_disable) { /* !82489DX */
980 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0);
983 oldvalue = apic_read(APIC_ESR);
984
985 /* enables sending errors */
986 value = ERROR_APIC_VECTOR;
987 apic_write_around(APIC_LVTERR, value);
988 /*
989 * spec says clear errors after enabling vector.
990 */
991 if (maxlvt > 3)
992 apic_write(APIC_ESR, 0);
993 value = apic_read(APIC_ESR);
994 if (value != oldvalue)
995 apic_printk(APIC_VERBOSE, "ESR value before enabling "
996 "vector: 0x%08lx after: 0x%08lx\n",
997 oldvalue, value);
998 } else {
999 if (esr_disable)
1000 /*
1001 * Something untraceable is creating bad interrupts on
1002 * secondary quads ... for the moment, just leave the
1003 * ESR disabled - we can't do anything useful with the
1004 * errors anyway - mbligh
1005 */
1006 printk(KERN_INFO "Leaving ESR disabled.\n");
1007 else
1008 printk(KERN_INFO "No ESR for 82489DX.\n");
1009 }
1010
1011 /* Disable the local apic timer */
1012 value = apic_read(APIC_LVTT);
1013 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1014 apic_write_around(APIC_LVTT, value);
1015
1016 setup_apic_nmi_watchdog(NULL);
1017 apic_pm_activate();
1018}
1019
1020/*
1021 * Detect and initialize APIC
1022 */
1023static int __init detect_init_APIC (void)
1024{
1025 u32 h, l, features;
1026
1027 /* Disabled by kernel option? */
1028 if (enable_local_apic < 0)
1029 return -1;
1030
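	/*
	 * Only CPU families known to have a usable (or software-enableable)
	 * local APIC get past this switch: AMD K7 (family 6, model > 1) and
	 * family 15, Intel family 6 and 15, and family 5 parts that report
	 * an APIC via CPUID.
	 */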
1031 switch (boot_cpu_data.x86_vendor) {
1032 case X86_VENDOR_AMD:
1033 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
1034 (boot_cpu_data.x86 == 15))
1035 break;
1036 goto no_apic;
1037 case X86_VENDOR_INTEL:
1038 if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
1039 (boot_cpu_data.x86 == 5 && cpu_has_apic))
1040 break;
1041 goto no_apic;
1042 default:
1043 goto no_apic;
1044 }
1045
1046 if (!cpu_has_apic) {
1047 /*
1048 * Over-ride BIOS and try to enable the local APIC only if
1049 * "lapic" specified.
1050 */
1051 if (enable_local_apic <= 0) {
1052 printk(KERN_INFO "Local APIC disabled by BIOS -- "
1053 "you can enable it with \"lapic\"\n");
1054 return -1;
1055 }
1056 /*
1057 * Some BIOSes disable the local APIC in the APIC_BASE
1058 * MSR. This can only be done in software for Intel P6 or later
1059 * and AMD K7 (Model > 1) or later.
1060 */
1061 rdmsr(MSR_IA32_APICBASE, l, h);
1062 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1063 printk(KERN_INFO
1064 "Local APIC disabled by BIOS -- reenabling.\n");
1065 l &= ~MSR_IA32_APICBASE_BASE;
1066 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1067 wrmsr(MSR_IA32_APICBASE, l, h);
1068 enabled_via_apicbase = 1;
1069 }
1070 }
1071 /*
1072 * The APIC feature bit should now be enabled
1073 * in `cpuid'
1074 */
1075 features = cpuid_edx(1);
1076 if (!(features & (1 << X86_FEATURE_APIC))) {
1077 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1;
1079 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082
1083 /* The BIOS may have set up the APIC at some other address */
1084 rdmsr(MSR_IA32_APICBASE, l, h);
1085 if (l & MSR_IA32_APICBASE_ENABLE)
1086 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1087
1088 if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
1089 nmi_watchdog = NMI_LOCAL_APIC;
1090
1091 printk(KERN_INFO "Found and enabled local APIC!\n");
1092
1093 apic_pm_activate();
1094
1095 return 0;
1096
1097no_apic:
1098 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1099 return -1;
1100}
1101
1102/**
1103 * init_apic_mappings - initialize APIC mappings
1104 */
1105void __init init_apic_mappings(void)
1106{
1107 unsigned long apic_phys;
1108
1109 /*
1110 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another
1112 * one for the IO-APIC.
1113 */
1114 if (!smp_found_config && detect_init_APIC()) {
1115 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
1116 apic_phys = __pa(apic_phys);
1117 } else
1118 apic_phys = mp_lapic_addr;
1119
1120 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1121 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
1122 apic_phys);
1123
1124 /*
1125 * Fetch the APIC ID of the BSP in case we have a
1126 * default configuration (or the MP table is broken).
1127 */
1128 if (boot_cpu_physical_apicid == -1U)
1129 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1130
1131#ifdef CONFIG_X86_IO_APIC
1132 {
1133 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
1134 int i;
1135
1136 for (i = 0; i < nr_ioapics; i++) {
1137 if (smp_found_config) {
1138 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
1139 if (!ioapic_phys) {
1140 printk(KERN_ERR
1141 "WARNING: bogus zero IO-APIC "
1142 "address found in MPTABLE, "
1143 "disabling IO/APIC support!\n");
1144 smp_found_config = 0;
1145 skip_ioapic_setup = 1;
1146 goto fake_ioapic_page;
1147 }
1148 } else {
1149fake_ioapic_page:
1150 ioapic_phys = (unsigned long)
1151 alloc_bootmem_pages(PAGE_SIZE);
1152 ioapic_phys = __pa(ioapic_phys);
1153 }
1154 set_fixmap_nocache(idx, ioapic_phys);
1155 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
1156 __fix_to_virt(idx), ioapic_phys);
1157 idx++;
1158 }
1159 }
1160#endif
1161}
1162
1163/*
1164 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel.
1166 */
1167int __init APIC_init_uniprocessor (void)
1168{
1169 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1171
1172 if (!smp_found_config && !cpu_has_apic)
1173 return -1;
1174
1175 /*
1176 * Complain if the BIOS pretends there is one.
1177 */
1178 if (!cpu_has_apic &&
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1183 return -1;
1184 }
1185
1186 verify_local_APIC();
1187
1188 connect_bsp_APIC();
1189
1190 /*
1191 * Hack: In case of kdump, after a crash, kernel might be booting
1192 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1193 * might be zero if read from MP tables. Get it from LAPIC.
1194 */
1195#ifdef CONFIG_CRASH_DUMP
1196 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
1197#endif
1198 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
1199
1200 setup_local_APIC();
1201
1202#ifdef CONFIG_X86_IO_APIC
1203 if (smp_found_config)
1204 if (!skip_ioapic_setup && nr_ioapics)
1205 setup_IO_APIC();
1206#endif
1207 setup_boot_clock();
1208
1209 return 0;
1210}
1211
1212/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts
1258 */
1259
1260/*
1261 * This interrupt should _never_ happen with our APIC/SMP architecture
1262 */
1263void smp_spurious_interrupt(struct pt_regs *regs)
1264{
1265 unsigned long v;
1266
1267 irq_enter();
1268 /*
1269 * Check if this really is a spurious interrupt and ACK it
1270 * if it is a vectored one. Just in case...
1271 * Spurious interrupts should not be ACKed.
1272 */
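	/*
	 * The ISR is an array of 32-bit registers spaced 0x10 apart, each
	 * covering 32 vectors, so (vector & ~0x1f) >> 1 yields the offset
	 * of the register that holds this vector's bit.
	 */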
1273 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1274 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1275 ack_APIC_irq();
1276
1277 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1278 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1279 "should never happen.\n", smp_processor_id());
1280 irq_exit();
1281}
1282
1283/*
1284 * This interrupt should never happen with our APIC/SMP architecture
1285 */
1286void smp_error_interrupt(struct pt_regs *regs)
1287{
1288 unsigned long v, v1;
1289
1290 irq_enter();
1291 /* First tickle the hardware, only then report what went on. -- REW */
1292 v = apic_read(APIC_ESR);
1293 apic_write(APIC_ESR, 0);
1294 v1 = apic_read(APIC_ESR);
1295 ack_APIC_irq();
1296 atomic_inc(&irq_err_count);
1297
1298 /* Here is what the APIC error bits mean:
1299 0: Send CS error
1300 1: Receive CS error
1301 2: Send accept error
1302 3: Receive accept error
1303 4: Reserved
1304 5: Send illegal vector
1305 6: Received illegal vector
1306 7: Illegal register address
1307 */
1308 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1309 smp_processor_id(), v , v1);
1310 irq_exit();
1311}
1312
1313/*
1314 * Initialize APIC interrupts
1315 */
1316void __init apic_intr_init(void)
1317{
1318#ifdef CONFIG_SMP
1319 smp_intr_init();
1320#endif
1321 /* self generated IPI for local APIC timer */
1322 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1323
1324 /* IPI vectors for APIC spurious and error interrupts */
1325 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1326 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1327
1328 /* thermal monitor LVT interrupt */
1329#ifdef CONFIG_X86_MCE_P4THERMAL
1330 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1331#endif
1332}
1333
1334/**
1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1336 */
1337void __init connect_bsp_APIC(void)
1338{
1339 if (pic_mode) {
1340 /*
1341 * Do not trust the local APIC being empty at bootup.
1342 */
1343 clear_local_APIC();
1344 /*
1345 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1346 * local APIC to INT and NMI lines.
1347 */
1348 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1349 "enabling APIC mode.\n");
1350 outb(0x70, 0x22);
1351 outb(0x01, 0x23);
1352 }
1353 enable_apic_mode();
1354}
1355
1356/**
1357 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1358 * @virt_wire_setup: indicates whether virtual wire mode is selected
1359 *
1360 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1361 * APIC is disabled.
1362 */
1363void disconnect_bsp_APIC(int virt_wire_setup)
1364{
1365 if (pic_mode) {
1366 /*
1367 * Put the board back into PIC mode (has an effect only on
1368 * certain older boards). Note that APIC interrupts, including
1369 * IPIs, won't work beyond this point! The only exception are
1370 * INIT IPIs.
1371 */
1372 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1373 "entering PIC mode.\n");
1374 outb(0x70, 0x22);
1375 outb(0x00, 0x23);
1376 } else {
1377 /* Go back to Virtual Wire compatibility mode */
1378 unsigned long value;
1379
1380 /* For the spurious interrupt use vector F, and enable it */
1381 value = apic_read(APIC_SPIV);
1382 value &= ~APIC_VECTOR_MASK;
1383 value |= APIC_SPIV_APIC_ENABLED;
1384 value |= 0xf;
1385 apic_write_around(APIC_SPIV, value);
1386
1387 if (!virt_wire_setup) {
1388 /*
1389 * For LVT0 make it edge triggered, active high,
1390 * external and enabled
1391 */
1392 value = apic_read(APIC_LVT0);
1393 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1394 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1395 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
1396 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1397 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1398 apic_write_around(APIC_LVT0, value);
1399 } else {
1400 /* Disable LVT0 */
1401 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1402 }
1403
1404 /*
1405 * For LVT1 make it edge triggered, active high, nmi and
1406 * enabled
1407 */
1408 value = apic_read(APIC_LVT1);
1409 value &= ~(
1410 APIC_MODE_MASK | APIC_SEND_PENDING |
1411 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1412 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1413 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1414 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1415 apic_write_around(APIC_LVT1, value);
1416 }
1417}
1418
1419/*
1420 * Power management
1421 */
1422#ifdef CONFIG_PM
1423
1424static struct {
1425 int active;
1426 /* r/w apic fields */
1427 unsigned int apic_id;
1428 unsigned int apic_taskpri;
1429 unsigned int apic_ldr;
1430 unsigned int apic_dfr;
1431 unsigned int apic_spiv;
1432 unsigned int apic_lvtt;
1433 unsigned int apic_lvtpc;
1434 unsigned int apic_lvt0;
1435 unsigned int apic_lvt1;
1436 unsigned int apic_lvterr;
1437 unsigned int apic_tmict;
1438 unsigned int apic_tdcr;
1439 unsigned int apic_thmr;
1440} apic_pm_state;
1441
1442static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1443{
1444 unsigned long flags;
1445 int maxlvt;
1446
1447 if (!apic_pm_state.active)
1448 return 0;
1449
1450 maxlvt = lapic_get_maxlvt();
1451
1452 apic_pm_state.apic_id = apic_read(APIC_ID);
1453 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1454 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1455 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1456 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1457 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1458 if (maxlvt >= 4)
1459 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1460 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1461 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1462 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1463 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1464 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1465#ifdef CONFIG_X86_MCE_P4THERMAL
1466 if (maxlvt >= 5)
1467 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1468#endif
1469
1470 local_irq_save(flags);
1471 disable_local_APIC();
1472 local_irq_restore(flags);
1473 return 0;
1474}
1475
1476static int lapic_resume(struct sys_device *dev)
1477{
1478 unsigned int l, h;
1479 unsigned long flags;
1480 int maxlvt;
1481
1482 if (!apic_pm_state.active)
1483 return 0;
1484
1485 maxlvt = lapic_get_maxlvt();
1486
1487 local_irq_save(flags);
1488
1489 /*
1490 * Make sure the APICBASE points to the right address
1491 *
1492 * FIXME! This will be wrong if we ever support suspend on
1493 * SMP! We'll need to do this as part of the CPU restore!
1494 */
1495 rdmsr(MSR_IA32_APICBASE, l, h);
1496 l &= ~MSR_IA32_APICBASE_BASE;
1497 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1498 wrmsr(MSR_IA32_APICBASE, l, h);
1499
1500 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1501 apic_write(APIC_ID, apic_pm_state.apic_id);
1502 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1503 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1504 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1505 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1506 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1507 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1508#ifdef CONFIG_X86_MCE_P4THERMAL
1509 if (maxlvt >= 5)
1510 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1511#endif
1512 if (maxlvt >= 4)
1513 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1514 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1515 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1516 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1517 apic_write(APIC_ESR, 0);
1518 apic_read(APIC_ESR);
1519 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1520 apic_write(APIC_ESR, 0);
1521 apic_read(APIC_ESR);
1522 local_irq_restore(flags);
1523 return 0;
1524}
1525
1526/*
1527 * This device has no shutdown method - fully functioning local APICs
1528 * are needed on every CPU up until machine_halt/restart/poweroff.
1529 */
1530
1531static struct sysdev_class lapic_sysclass = {
1532 set_kset_name("lapic"),
1533 .resume = lapic_resume,
1534 .suspend = lapic_suspend,
1535};
1536
1537static struct sys_device device_lapic = {
1538 .id = 0,
1539 .cls = &lapic_sysclass,
1540};
1541
1542static void __devinit apic_pm_activate(void)
1543{
1544 apic_pm_state.active = 1;
1545}
1546
1547static int __init init_lapic_sysfs(void)
1548{
1549 int error;
1550
1551 if (!cpu_has_apic)
1552 return 0;
1553 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1554
1555 error = sysdev_class_register(&lapic_sysclass);
1556 if (!error)
1557 error = sysdev_register(&device_lapic);
1558 return error;
1559}
1560device_initcall(init_lapic_sysfs);
1561
1562#else /* CONFIG_PM */
1563
1564static void apic_pm_activate(void) { }
1565
1566#endif /* CONFIG_PM */
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
new file mode 100644
index 000000000000..925758dbca0c
--- /dev/null
+++ b/arch/x86/kernel/apic_64.c
@@ -0,0 +1,1253 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h>
28
29#include <asm/atomic.h>
30#include <asm/smp.h>
31#include <asm/mtrr.h>
32#include <asm/mpspec.h>
33#include <asm/pgalloc.h>
34#include <asm/mach_apic.h>
35#include <asm/nmi.h>
36#include <asm/idle.h>
37#include <asm/proto.h>
38#include <asm/timex.h>
39#include <asm/hpet.h>
40#include <asm/apic.h>
41
42int apic_mapped;
43int apic_verbosity;
44int apic_runs_main_timer;
45int apic_calibrate_pmtmr __initdata;
46
47int disable_apic_timer __initdata;
48
49/* Local APIC timer works in C2? */
50int local_apic_timer_c2_ok;
51EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
52
53static struct resource *ioapic_resources;
54static struct resource lapic_resource = {
55 .name = "Local APIC",
56 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
57};
58
59/*
60 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
61 * IPIs in place of local APIC timers
62 */
63static cpumask_t timer_interrupt_broadcast_ipi_mask;
64
65/* Using APIC to generate smp_local_timer_interrupt? */
66int using_apic_timer __read_mostly = 0;
67
68static void apic_pm_activate(void);
69
70void apic_wait_icr_idle(void)
71{
72 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
73 cpu_relax();
74}
75
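/*
 * Like apic_wait_icr_idle(), but give up after roughly 100ms
 * (1000 polls, 100us apart) and return the last busy status
 * instead of spinning forever.
 */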
76unsigned int safe_apic_wait_icr_idle(void)
77{
78 unsigned int send_status;
79 int timeout;
80
81 timeout = 0;
82 do {
83 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
84 if (!send_status)
85 break;
86 udelay(100);
87 } while (timeout++ < 1000);
88
89 return send_status;
90}
91
92void enable_NMI_through_LVT0 (void * dummy)
93{
94 unsigned int v;
95
96 /* unmask and set to NMI */
97 v = APIC_DM_NMI;
98 apic_write(APIC_LVT0, v);
99}
100
101int get_maxlvt(void)
102{
103 unsigned int v, maxlvt;
104
105 v = apic_read(APIC_LVR);
106 maxlvt = GET_APIC_MAXLVT(v);
107 return maxlvt;
108}
109
110/*
111 * 'what should we do if we get a hw irq event on an illegal vector'.
112 * Each architecture has to answer this itself.
113 */
114void ack_bad_irq(unsigned int irq)
115{
116 printk("unexpected IRQ trap at vector %02x\n", irq);
117 /*
118 * Currently unexpected vectors happen only on SMP and APIC.
119 * We _must_ ack these because every local APIC has only N
120 * irq slots per priority level, and a 'hanging, unacked' IRQ
121 * holds up an irq slot - in excessive cases (when multiple
122 * unexpected vectors occur) that might lock up the APIC
123 * completely.
124 * But don't ack when the APIC is disabled. -AK
125 */
126 if (!disable_apic)
127 ack_APIC_irq();
128}
129
130void clear_local_APIC(void)
131{
132 int maxlvt;
133 unsigned int v;
134
135 maxlvt = get_maxlvt();
136
137 /*
138 * Masking an LVT entry can trigger a local APIC error
139 * if the vector is zero. Mask LVTERR first to prevent this.
140 */
141 if (maxlvt >= 3) {
142 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
143 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
144 }
145 /*
146 * Careful: we have to set masks only first to deassert
147 * any level-triggered sources.
148 */
149 v = apic_read(APIC_LVTT);
150 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
151 v = apic_read(APIC_LVT0);
152 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
153 v = apic_read(APIC_LVT1);
154 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
155 if (maxlvt >= 4) {
156 v = apic_read(APIC_LVTPC);
157 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
158 }
159
160 /*
161 * Clean APIC state for other OSs:
162 */
163 apic_write(APIC_LVTT, APIC_LVT_MASKED);
164 apic_write(APIC_LVT0, APIC_LVT_MASKED);
165 apic_write(APIC_LVT1, APIC_LVT_MASKED);
166 if (maxlvt >= 3)
167 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
168 if (maxlvt >= 4)
169 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
170 apic_write(APIC_ESR, 0);
171 apic_read(APIC_ESR);
172}
173
174void disconnect_bsp_APIC(int virt_wire_setup)
175{
176 /* Go back to Virtual Wire compatibility mode */
177 unsigned long value;
178
179 /* For the spurious interrupt use vector F, and enable it */
180 value = apic_read(APIC_SPIV);
181 value &= ~APIC_VECTOR_MASK;
182 value |= APIC_SPIV_APIC_ENABLED;
183 value |= 0xf;
184 apic_write(APIC_SPIV, value);
185
186 if (!virt_wire_setup) {
187 /* For LVT0 make it edge triggered, active high, external and enabled */
188 value = apic_read(APIC_LVT0);
189 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
190 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
191 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
192 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
193 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
194 apic_write(APIC_LVT0, value);
195 } else {
196 /* Disable LVT0 */
197 apic_write(APIC_LVT0, APIC_LVT_MASKED);
198 }
199
200 /* For LVT1 make it edge triggered, active high, nmi and enabled */
201 value = apic_read(APIC_LVT1);
202 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
203 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
204 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
205 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
206 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
207 apic_write(APIC_LVT1, value);
208}
209
210void disable_local_APIC(void)
211{
212 unsigned int value;
213
214 clear_local_APIC();
215
216 /*
217 * Disable APIC (implies clearing of registers
218 * for 82489DX!).
219 */
220 value = apic_read(APIC_SPIV);
221 value &= ~APIC_SPIV_APIC_ENABLED;
222 apic_write(APIC_SPIV, value);
223}
224
225/*
226 * This is to verify that we're looking at a real local APIC.
227 * Check these against your board if the CPUs aren't getting
228 * started for no apparent reason.
229 */
230int __init verify_local_APIC(void)
231{
232 unsigned int reg0, reg1;
233
234 /*
235 * The version register is read-only in a real APIC.
236 */
237 reg0 = apic_read(APIC_LVR);
238 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
239 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
240 reg1 = apic_read(APIC_LVR);
241 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
242
243 /*
244 * The two version reads above should print the same
245 * numbers. If the second one is different, then we
246 * poke at a non-APIC.
247 */
248 if (reg1 != reg0)
249 return 0;
250
251 /*
252 * Check if the version looks reasonable.
253 */
254 reg1 = GET_APIC_VERSION(reg0);
255 if (reg1 == 0x00 || reg1 == 0xff)
256 return 0;
257 reg1 = get_maxlvt();
258 if (reg1 < 0x02 || reg1 == 0xff)
259 return 0;
260
261 /*
262 * The ID register is read/write in a real APIC.
263 */
264 reg0 = apic_read(APIC_ID);
265 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
266 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
267 reg1 = apic_read(APIC_ID);
268 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
269 apic_write(APIC_ID, reg0);
270 if (reg1 != (reg0 ^ APIC_ID_MASK))
271 return 0;
272
273 /*
274 * The next two are just to see if we have sane values.
275 * They're only really relevant if we're in Virtual Wire
276 * compatibility mode, but most boxes aren't anymore.
277 */
278 reg0 = apic_read(APIC_LVT0);
279 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
280 reg1 = apic_read(APIC_LVT1);
281 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
282
283 return 1;
284}
285
286void __init sync_Arb_IDs(void)
287{
288 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
289 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
290 if (ver >= 0x14) /* P4 or higher */
291 return;
292
293 /*
294 * Wait for idle.
295 */
296 apic_wait_icr_idle();
297
298 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
299 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
300 | APIC_DM_INIT);
301}
302
303/*
304 * An initial setup of the virtual wire mode.
305 */
306void __init init_bsp_APIC(void)
307{
308 unsigned int value;
309
310 /*
311 * Don't do the setup now if we have an SMP BIOS as the
312 * through-I/O-APIC virtual wire mode might be active.
313 */
314 if (smp_found_config || !cpu_has_apic)
315 return;
316
317 value = apic_read(APIC_LVR);
318
319 /*
320 * Do not trust the local APIC being empty at bootup.
321 */
322 clear_local_APIC();
323
324 /*
325 * Enable APIC.
326 */
327 value = apic_read(APIC_SPIV);
328 value &= ~APIC_VECTOR_MASK;
329 value |= APIC_SPIV_APIC_ENABLED;
330 value |= APIC_SPIV_FOCUS_DISABLED;
331 value |= SPURIOUS_APIC_VECTOR;
332 apic_write(APIC_SPIV, value);
333
334 /*
335 * Set up the virtual wire mode.
336 */
337 apic_write(APIC_LVT0, APIC_DM_EXTINT);
338 value = APIC_DM_NMI;
339 apic_write(APIC_LVT1, value);
340}
341
342void __cpuinit setup_local_APIC (void)
343{
344 unsigned int value, maxlvt;
345 int i, j;
346
347 value = apic_read(APIC_LVR);
348
349 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
350
351 /*
352 * Double-check whether this APIC is really registered.
353 * This is meaningless in clustered apic mode, so we skip it.
354 */
355 if (!apic_id_registered())
356 BUG();
357
358 /*
359 * Intel recommends to set DFR, LDR and TPR before enabling
360 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
361 * document number 292116). So here it goes...
362 */
363 init_apic_ldr();
364
365 /*
366 * Set Task Priority to 'accept all'. We never change this
367 * later on.
368 */
369 value = apic_read(APIC_TASKPRI);
370 value &= ~APIC_TPRI_MASK;
371 apic_write(APIC_TASKPRI, value);
372
373 /*
374 * After a crash, we no longer service the interrupts and a pending
375 * interrupt from previous kernel might still have ISR bit set.
376 *
377 * Most probably by now the CPU has serviced that pending interrupt and
378 * it might not have done the ack_APIC_irq() because it thought the
379 * interrupt came from the i8259 as ExtInt. The LAPIC did not get an EOI, so
380 * it does not clear the ISR bit and the CPU thinks it has already serviced
381 * the interrupt. Hence a vector might get locked. This was noticed for the
382 * timer irq (vector 0x31). Issue an extra EOI to clear the ISR.
383 */
384 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
385 value = apic_read(APIC_ISR + i*0x10);
386 for (j = 31; j >= 0; j--) {
387 if (value & (1<<j))
388 ack_APIC_irq();
389 }
390 }
391
392 /*
393 * Now that we are all set up, enable the APIC
394 */
395 value = apic_read(APIC_SPIV);
396 value &= ~APIC_VECTOR_MASK;
397 /*
398 * Enable APIC
399 */
400 value |= APIC_SPIV_APIC_ENABLED;
401
402 /* We always use processor focus */
403
404 /*
405 * Set spurious IRQ vector
406 */
407 value |= SPURIOUS_APIC_VECTOR;
408 apic_write(APIC_SPIV, value);
409
410 /*
411 * Set up LVT0, LVT1:
412 *
413 * set up through-local-APIC on the BP's LINT0. This is not
414 * strictly necessary in pure symmetric-IO mode, but sometimes
415 * we delegate interrupts to the 8259A.
416 */
417 /*
418 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
419 */
420 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
421 if (!smp_processor_id() && !value) {
422 value = APIC_DM_EXTINT;
423 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
424 } else {
425 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
426 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
427 }
428 apic_write(APIC_LVT0, value);
429
430 /*
431 * only the BP should see the LINT1 NMI signal, obviously.
432 */
433 if (!smp_processor_id())
434 value = APIC_DM_NMI;
435 else
436 value = APIC_DM_NMI | APIC_LVT_MASKED;
437 apic_write(APIC_LVT1, value);
438
439 {
440 unsigned oldvalue;
441 maxlvt = get_maxlvt();
442 oldvalue = apic_read(APIC_ESR);
443 value = ERROR_APIC_VECTOR; // enables sending errors
444 apic_write(APIC_LVTERR, value);
445 /*
446 * spec says clear errors after enabling vector.
447 */
448 if (maxlvt > 3)
449 apic_write(APIC_ESR, 0);
450 value = apic_read(APIC_ESR);
451 if (value != oldvalue)
452 apic_printk(APIC_VERBOSE,
453 "ESR value after enabling vector: %08x, after %08x\n",
454 oldvalue, value);
455 }
456
457 nmi_watchdog_default();
458 setup_apic_nmi_watchdog(NULL);
459 apic_pm_activate();
460}
461
462#ifdef CONFIG_PM
463
464static struct {
465 /* 'active' is true if the local APIC was enabled by us and
466 not the BIOS; this signifies that we are also responsible
467 for disabling it before entering apm/acpi suspend */
468 int active;
469 /* r/w apic fields */
470 unsigned int apic_id;
471 unsigned int apic_taskpri;
472 unsigned int apic_ldr;
473 unsigned int apic_dfr;
474 unsigned int apic_spiv;
475 unsigned int apic_lvtt;
476 unsigned int apic_lvtpc;
477 unsigned int apic_lvt0;
478 unsigned int apic_lvt1;
479 unsigned int apic_lvterr;
480 unsigned int apic_tmict;
481 unsigned int apic_tdcr;
482 unsigned int apic_thmr;
483} apic_pm_state;
484
485static int lapic_suspend(struct sys_device *dev, pm_message_t state)
486{
487 unsigned long flags;
488 int maxlvt;
489
490 if (!apic_pm_state.active)
491 return 0;
492
493 maxlvt = get_maxlvt();
494
495 apic_pm_state.apic_id = apic_read(APIC_ID);
496 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
497 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
498 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
499 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
500 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
501 if (maxlvt >= 4)
502 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
503 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
504 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
505 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
506 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
507 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
508#ifdef CONFIG_X86_MCE_INTEL
509 if (maxlvt >= 5)
510 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
511#endif
512 local_irq_save(flags);
513 disable_local_APIC();
514 local_irq_restore(flags);
515 return 0;
516}
517
518static int lapic_resume(struct sys_device *dev)
519{
520 unsigned int l, h;
521 unsigned long flags;
522 int maxlvt;
523
524 if (!apic_pm_state.active)
525 return 0;
526
527 maxlvt = get_maxlvt();
528
529 local_irq_save(flags);
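	/*
	 * Point the APIC base MSR back at mp_lapic_addr and re-enable it
	 * before restoring the saved registers.
	 */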
530 rdmsr(MSR_IA32_APICBASE, l, h);
531 l &= ~MSR_IA32_APICBASE_BASE;
532 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
533 wrmsr(MSR_IA32_APICBASE, l, h);
534 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
535 apic_write(APIC_ID, apic_pm_state.apic_id);
536 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
537 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
538 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
539 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
540 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
541 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
542#ifdef CONFIG_X86_MCE_INTEL
543 if (maxlvt >= 5)
544 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
545#endif
546 if (maxlvt >= 4)
547 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
548 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
549 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
550 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
551 apic_write(APIC_ESR, 0);
552 apic_read(APIC_ESR);
553 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
554 apic_write(APIC_ESR, 0);
555 apic_read(APIC_ESR);
556 local_irq_restore(flags);
557 return 0;
558}
559
560static struct sysdev_class lapic_sysclass = {
561 set_kset_name("lapic"),
562 .resume = lapic_resume,
563 .suspend = lapic_suspend,
564};
565
566static struct sys_device device_lapic = {
567 .id = 0,
568 .cls = &lapic_sysclass,
569};
570
571static void __cpuinit apic_pm_activate(void)
572{
573 apic_pm_state.active = 1;
574}
575
576static int __init init_lapic_sysfs(void)
577{
578 int error;
579 if (!cpu_has_apic)
580 return 0;
581 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
582 error = sysdev_class_register(&lapic_sysclass);
583 if (!error)
584 error = sysdev_register(&device_lapic);
585 return error;
586}
587device_initcall(init_lapic_sysfs);
588
589#else /* CONFIG_PM */
590
591static void apic_pm_activate(void) { }
592
593#endif /* CONFIG_PM */
594
595static int __init apic_set_verbosity(char *str)
596{
597 if (str == NULL) {
598 skip_ioapic_setup = 0;
599 ioapic_force = 1;
600 return 0;
601 }
602 if (strcmp("debug", str) == 0)
603 apic_verbosity = APIC_DEBUG;
604 else if (strcmp("verbose", str) == 0)
605 apic_verbosity = APIC_VERBOSE;
606 else {
607 printk(KERN_WARNING "APIC Verbosity level %s not recognised,"
608 " use apic=verbose or apic=debug\n", str);
609 return -EINVAL;
610 }
611
612 return 0;
613}
614early_param("apic", apic_set_verbosity);
615
616/*
617 * Detect and enable local APICs on non-SMP boards.
618 * Original code written by Keir Fraser.
619 * On AMD64 we trust the BIOS - if it says no APIC it is likely
620 * not correctly set up (usually the APIC timer won't work etc.)
621 */
622
623static int __init detect_init_APIC (void)
624{
625 if (!cpu_has_apic) {
626 printk(KERN_INFO "No local APIC present\n");
627 return -1;
628 }
629
630 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
631 boot_cpu_id = 0;
632 return 0;
633}
634
635#ifdef CONFIG_X86_IO_APIC
636static struct resource * __init ioapic_setup_resources(void)
637{
638#define IOAPIC_RESOURCE_NAME_SIZE 11
639 unsigned long n;
640 struct resource *res;
641 char *mem;
642 int i;
643
644 if (nr_ioapics <= 0)
645 return NULL;
646
647 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
648 n *= nr_ioapics;
649
650 mem = alloc_bootmem(n);
651 res = (void *)mem;
652
653 if (mem != NULL) {
654 memset(mem, 0, n);
655 mem += sizeof(struct resource) * nr_ioapics;
656
657 for (i = 0; i < nr_ioapics; i++) {
658 res[i].name = mem;
659 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
660 sprintf(mem, "IOAPIC %u", i);
661 mem += IOAPIC_RESOURCE_NAME_SIZE;
662 }
663 }
664
665 ioapic_resources = res;
666
667 return res;
668}
669
670static int __init ioapic_insert_resources(void)
671{
672 int i;
673 struct resource *r = ioapic_resources;
674
675 if (!r) {
676 printk("IO APIC resources could be not be allocated.\n");
677 return -1;
678 }
679
680 for (i = 0; i < nr_ioapics; i++) {
681 insert_resource(&iomem_resource, r);
682 r++;
683 }
684
685 return 0;
686}
687
688/* Insert the IO APIC resources after PCI initialization has occurred to handle
689 * IO APICS that are mapped in on a BAR in PCI space. */
690late_initcall(ioapic_insert_resources);
691#endif
692
693void __init init_apic_mappings(void)
694{
695 unsigned long apic_phys;
696
697 /*
698 * If no local APIC can be found then set up a fake all
699 * zeroes page to simulate the local APIC and another
700 * one for the IO-APIC.
701 */
702 if (!smp_found_config && detect_init_APIC()) {
703 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
704 apic_phys = __pa(apic_phys);
705 } else
706 apic_phys = mp_lapic_addr;
707
708 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
709 apic_mapped = 1;
710 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
711
712 /* Put local APIC into the resource map. */
713 lapic_resource.start = apic_phys;
714 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
715 insert_resource(&iomem_resource, &lapic_resource);
716
717 /*
718 * Fetch the APIC ID of the BSP in case we have a
719 * default configuration (or the MP table is broken).
720 */
721 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
722
723 {
724 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
725 int i;
726 struct resource *ioapic_res;
727
728 ioapic_res = ioapic_setup_resources();
729 for (i = 0; i < nr_ioapics; i++) {
730 if (smp_found_config) {
731 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
732 } else {
733 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
734 ioapic_phys = __pa(ioapic_phys);
735 }
736 set_fixmap_nocache(idx, ioapic_phys);
737 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
738 __fix_to_virt(idx), ioapic_phys);
739 idx++;
740
741 if (ioapic_res != NULL) {
742 ioapic_res->start = ioapic_phys;
743 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
744 ioapic_res++;
745 }
746 }
747 }
748}
749
750/*
751 * This function sets up the local APIC timer, with a timeout of
752 * 'clocks' APIC bus clock. During calibration we actually call
753 * this function twice on the boot CPU, once with a bogus timeout
754 * value, second time for real. The other (noncalibrating) CPUs
755 * call this function only once, with the real, calibrated value.
756 *
757 * We do reads before writes even if unnecessary, to get around the
758 * P5 APIC double write bug.
759 */
760
761#define APIC_DIVISOR 16
762
763static void __setup_APIC_LVTT(unsigned int clocks)
764{
765 unsigned int lvtt_value, tmp_value;
766 int cpu = smp_processor_id();
767
768 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
769
770 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
771 lvtt_value |= APIC_LVT_MASKED;
772
773 apic_write(APIC_LVTT, lvtt_value);
774
775 /*
776 * Divide PICLK by 16
777 */
778 tmp_value = apic_read(APIC_TDCR);
779 apic_write(APIC_TDCR, (tmp_value
780 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
781 | APIC_TDR_DIV_16);
782
783 apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
784}
785
786static void setup_APIC_timer(unsigned int clocks)
787{
788 unsigned long flags;
789
790 local_irq_save(flags);
791
792 /* wait for irq slice */
793 if (hpet_address && hpet_use_timer) {
794 u32 trigger = hpet_readl(HPET_T0_CMP);
795 while (hpet_readl(HPET_T0_CMP) == trigger)
796 /* do nothing */ ;
797 } else {
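		/*
		 * No usable HPET: latch and read PIT counter 0 (command port
		 * 0x43, data port 0x40) and spin until the countdown wraps
		 * back up, i.e. until a new PIT period begins, so the APIC
		 * timer is started on an irq-slice boundary.
		 */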
798 int c1, c2;
799 outb_p(0x00, 0x43);
800 c2 = inb_p(0x40);
801 c2 |= inb_p(0x40) << 8;
802 do {
803 c1 = c2;
804 outb_p(0x00, 0x43);
805 c2 = inb_p(0x40);
806 c2 |= inb_p(0x40) << 8;
807 } while (c2 - c1 < 300);
808 }
809 __setup_APIC_LVTT(clocks);
810 /* Turn off PIT interrupt if we use APIC timer as main timer.
811 Only works with the PM timer right now
812 TBD fix it for HPET too. */
813 if ((pmtmr_ioport != 0) &&
814 smp_processor_id() == boot_cpu_id &&
815 apic_runs_main_timer == 1 &&
816 !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
817 stop_timer_interrupt();
818 apic_runs_main_timer++;
819 }
820 local_irq_restore(flags);
821}
822
823/*
824 * In this function we calibrate APIC bus clocks to the external
825 * timer. Unfortunately we cannot use jiffies and the timer irq
826 * to calibrate, since some later bootup code depends on getting
827 * the first irq? Ugh.
828 *
829 * We want to do the calibration only once since we
830 * want to have local timer irqs in sync. CPUs connected
831 * by the same APIC bus have the very same bus frequency.
832 * And we want to have irqs off anyway, no accidental
833 * APIC irq that way.
834 */
835
836#define TICK_COUNT 100000000
837
838static int __init calibrate_APIC_clock(void)
839{
840 unsigned apic, apic_start;
841 unsigned long tsc, tsc_start;
842 int result;
843 /*
844 * Put whatever arbitrary (but long enough) timeout
845 * value into the APIC clock, we just want to get the
846 * counter running for calibration.
847 */
848 __setup_APIC_LVTT(4000000000);
849
850 apic_start = apic_read(APIC_TMCCT);
851#ifdef CONFIG_X86_PM_TIMER
852 if (apic_calibrate_pmtmr && pmtmr_ioport) {
853 pmtimer_wait(5000); /* 5ms wait */
854 apic = apic_read(APIC_TMCCT);
855 result = (apic_start - apic) * 1000L / 5;
856 } else
857#endif
858 {
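		/*
		 * No PM timer: sample the APIC countdown and the TSC together
		 * until one of them has advanced by TICK_COUNT, then scale the
		 * elapsed APIC ticks by tsc_khz to get the APIC timer rate
		 * in Hz.
		 */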
859 rdtscll(tsc_start);
860
861 do {
862 apic = apic_read(APIC_TMCCT);
863 rdtscll(tsc);
864 } while ((tsc - tsc_start) < TICK_COUNT &&
865 (apic_start - apic) < TICK_COUNT);
866
867 result = (apic_start - apic) * 1000L * tsc_khz /
868 (tsc - tsc_start);
869 }
870 printk("result %d\n", result);
871
872
873 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
874 result / 1000 / 1000, result / 1000 % 1000);
875
876 return result * APIC_DIVISOR / HZ;
877}
878
879static unsigned int calibration_result;
880
881void __init setup_boot_APIC_clock (void)
882{
883 if (disable_apic_timer) {
884 printk(KERN_INFO "Disabling APIC timer\n");
885 return;
886 }
887
888 printk(KERN_INFO "Using local APIC timer interrupts.\n");
889 using_apic_timer = 1;
890
891 local_irq_disable();
892
893 calibration_result = calibrate_APIC_clock();
894 /*
895 * Now set up the timer for real.
896 */
897 setup_APIC_timer(calibration_result);
898
899 local_irq_enable();
900}
901
902void __cpuinit setup_secondary_APIC_clock(void)
903{
904 local_irq_disable(); /* FIXME: Do we need this? --RR */
905 setup_APIC_timer(calibration_result);
906 local_irq_enable();
907}
908
909void disable_APIC_timer(void)
910{
911 if (using_apic_timer) {
912 unsigned long v;
913
914 v = apic_read(APIC_LVTT);
915 /*
916 * When an illegal vector value (0-15) is written to an LVT
917 * entry and delivery mode is Fixed, the APIC may signal an
918 * illegal vector error, without regard to whether the mask
919 * bit is set or whether an interrupt is actually seen on input.
920 *
921 * The boot sequence might call this function when the LVTT has a
922 * '0' vector value. So make sure the vector field is set to a
923 * valid value.
924 */
925 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
926 apic_write(APIC_LVTT, v);
927 }
928}
929
930void enable_APIC_timer(void)
931{
932 int cpu = smp_processor_id();
933
934 if (using_apic_timer &&
935 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
936 unsigned long v;
937
938 v = apic_read(APIC_LVTT);
939 apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
940 }
941}
942
943void switch_APIC_timer_to_ipi(void *cpumask)
944{
945 cpumask_t mask = *(cpumask_t *)cpumask;
946 int cpu = smp_processor_id();
947
948 if (cpu_isset(cpu, mask) &&
949 !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
950 disable_APIC_timer();
951 cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
952 }
953}
954EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
955
956void smp_send_timer_broadcast_ipi(void)
957{
958 int cpu = smp_processor_id();
959 cpumask_t mask;
960
961 cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
962
963 if (cpu_isset(cpu, mask)) {
964 cpu_clear(cpu, mask);
965 add_pda(apic_timer_irqs, 1);
966 smp_local_timer_interrupt();
967 }
968
969 if (!cpus_empty(mask)) {
970 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
971 }
972}
973
974void switch_ipi_to_APIC_timer(void *cpumask)
975{
976 cpumask_t mask = *(cpumask_t *)cpumask;
977 int cpu = smp_processor_id();
978
979 if (cpu_isset(cpu, mask) &&
980 cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
981 cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
982 enable_APIC_timer();
983 }
984}
985EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
986
987int setup_profiling_timer(unsigned int multiplier)
988{
989 return -EINVAL;
990}
991
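/*
 * Program a K8 extended LVT entry: entries start at K8_APIC_EXT_LVT_BASE
 * and are 16 bytes apart; the mask bit lives at bit 16, the message type
 * in bits 15:8 and the vector in bits 7:0.
 */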
992void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
993 unsigned char msg_type, unsigned char mask)
994{
995 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
996 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
997 apic_write(reg, v);
998}
999
1000#undef APIC_DIVISOR
1001
1002/*
1003 * Local timer interrupt handler. It does both profiling and
1004 * process statistics/rescheduling.
1005 *
1006 * We do profiling in every local tick, statistics/rescheduling
1007 * happen only every 'profiling multiplier' ticks. The default
1008 * multiplier is 1 and it can be changed by writing the new multiplier
1009 * value into /proc/profile.
1010 */
1011
1012void smp_local_timer_interrupt(void)
1013{
1014 profile_tick(CPU_PROFILING);
1015#ifdef CONFIG_SMP
1016 update_process_times(user_mode(get_irq_regs()));
1017#endif
1018 if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
1019 main_timer_handler();
1020 /*
1021 * We take the 'long' return path, and there every subsystem
1022 * grabs the appropriate locks (kernel lock/ irq lock).
1023 *
1024 * We might want to decouple profiling from the 'long path',
1025 * and do the profiling totally in assembly.
1026 *
1027 * Currently this isn't too much of an issue (performance wise),
1028 * we can take more than 100K local irqs per second on a 100 MHz P5.
1029 */
1030}
1031
1032/*
1033 * Local APIC timer interrupt. This is the most natural way for doing
1034 * local interrupts, but local timer interrupts can be emulated by
1035 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1036 *
1037 * [ if a single-CPU system runs an SMP kernel then we call the local
1038 * interrupt as well. Thus we cannot inline the local irq ... ]
1039 */
1040void smp_apic_timer_interrupt(struct pt_regs *regs)
1041{
1042 struct pt_regs *old_regs = set_irq_regs(regs);
1043
1044 /*
1045 * the NMI deadlock-detector uses this.
1046 */
1047 add_pda(apic_timer_irqs, 1);
1048
1049 /*
1050 * NOTE! We'd better ACK the irq immediately,
1051 * because timer handling can be slow.
1052 */
1053 ack_APIC_irq();
1054 /*
1055 * update_process_times() expects us to have done irq_enter().
1056 * Besides, if we don't, timer interrupts ignore the global
1057 * interrupt lock, which is the WrongThing (tm) to do.
1058 */
1059 exit_idle();
1060 irq_enter();
1061 smp_local_timer_interrupt();
1062 irq_exit();
1063 set_irq_regs(old_regs);
1064}
1065
1066/*
1067 * apic_is_clustered_box() -- Check if we can expect good TSC
1068 *
1069 * Thus far, the major user of this is IBM's Summit2 series:
1070 *
1071 * Clustered boxes may have unsynced TSC problems if they are
1072 * multi-chassis. Use available data to take a good guess.
1073 * If in doubt, go HPET.
1074 */
1075__cpuinit int apic_is_clustered_box(void)
1076{
1077 int i, clusters, zeros;
1078 unsigned id;
1079 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1080
1081 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1082
1083 for (i = 0; i < NR_CPUS; i++) {
1084 id = bios_cpu_apicid[i];
1085 if (id != BAD_APICID)
1086 __set_bit(APIC_CLUSTERID(id), clustermap);
1087 }
1088
1089 /* Problem: Partially populated chassis may not have CPUs in some of
1090 * the APIC clusters they have been allocated. Only present CPUs have
1091 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
1092 * clusters are allocated sequentially, count zeros only if they are
1093 * bounded by ones.
1094 */
1095 clusters = 0;
1096 zeros = 0;
1097 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1098 if (test_bit(i, clustermap)) {
1099 clusters += 1 + zeros;
1100 zeros = 0;
1101 } else
1102 ++zeros;
1103 }
1104
1105 /*
1106 * If clusters > 2, then it should be multi-chassis.
1107 * May have to revisit this when multi-core + hyperthreaded CPUs come
1108 * out, but AFAIK this will work even for them.
1109 */
1110 return (clusters > 2);
1111}
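
As a standalone illustration of the counting rule in the comment above (hypothetical cluster map, not kernel code): with clusters 0, 1 and 3 populated and cluster 2 an allocated-but-empty hole, the hole is counted because it is bounded by ones, while a trailing empty cluster is not.

#include <stdio.h>

/* Hypothetical illustration of apic_is_clustered_box()'s counting rule:
 * zeros are only counted when they are bounded by ones. */
int main(void)
{
	int clustermap[5] = { 1, 1, 0, 1, 0 };	/* assumed example data */
	int clusters = 0, zeros = 0, i;

	for (i = 0; i < 5; i++) {
		if (clustermap[i]) {
			clusters += 1 + zeros;
			zeros = 0;
		} else {
			++zeros;
		}
	}
	printf("clusters = %d\n", clusters);	/* prints "clusters = 4" */
	return 0;
}
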
1112
1113/*
1114 * This interrupt should _never_ happen with our APIC/SMP architecture
1115 */
1116asmlinkage void smp_spurious_interrupt(void)
1117{
1118 unsigned int v;
1119 exit_idle();
1120 irq_enter();
1121 /*
1122 * Check if this really is a spurious interrupt and ACK it
1123 * if it is a vectored one. Just in case...
1124 * Spurious interrupts should not be ACKed.
1125 */
1126 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1127 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1128 ack_APIC_irq();
1129
1130 irq_exit();
1131}
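
The APIC_ISR arithmetic above relies on the in-service bits for 256 vectors being spread across eight 32-bit registers spaced 16 bytes apart, so a vector maps to a register offset of (vector & ~0x1f) >> 1 and a bit position of vector & 0x1f. A small userspace sketch of that mapping, using 0xff purely as an assumed example vector:

#include <stdio.h>

/* Sketch of the ISR lookup used in smp_spurious_interrupt(): compute the
 * byte offset from APIC_ISR and the bit within that 32-bit register for
 * a given vector.  0xff below is only an assumed example value. */
int main(void)
{
	unsigned int vector = 0xff;

	printf("offset = 0x%x, bit = %u\n",
	       (vector & ~0x1fu) >> 1, vector & 0x1f);
	/* prints "offset = 0x70, bit = 31" */
	return 0;
}
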
1132
1133/*
1134 * This interrupt should never happen with our APIC/SMP architecture
1135 */
1136
1137asmlinkage void smp_error_interrupt(void)
1138{
1139 unsigned int v, v1;
1140
1141 exit_idle();
1142 irq_enter();
1143 /* First tickle the hardware, only then report what went on. -- REW */
1144 v = apic_read(APIC_ESR);
1145 apic_write(APIC_ESR, 0);
1146 v1 = apic_read(APIC_ESR);
1147 ack_APIC_irq();
1148 atomic_inc(&irq_err_count);
1149
1150 /* Here is what the APIC error bits mean:
1151 0: Send CS error
1152 1: Receive CS error
1153 2: Send accept error
1154 3: Receive accept error
1155 4: Reserved
1156 5: Send illegal vector
1157 6: Received illegal vector
1158 7: Illegal register address
1159 */
1160 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1161 smp_processor_id(), v , v1);
1162 irq_exit();
1163}
1164
1165int disable_apic;
1166
1167/*
1168 * This initializes the IO-APIC and APIC hardware if this is
1169 * a UP kernel.
1170 */
1171int __init APIC_init_uniprocessor (void)
1172{
1173 if (disable_apic) {
1174 printk(KERN_INFO "Apic disabled\n");
1175 return -1;
1176 }
1177 if (!cpu_has_apic) {
1178 disable_apic = 1;
1179 printk(KERN_INFO "Apic disabled by BIOS\n");
1180 return -1;
1181 }
1182
1183 verify_local_APIC();
1184
1185 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1186 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1187
1188 setup_local_APIC();
1189
1190 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1191 setup_IO_APIC();
1192 else
1193 nr_ioapics = 0;
1194 setup_boot_APIC_clock();
1195 check_nmi_watchdog();
1196 return 0;
1197}
1198
1199static __init int setup_disableapic(char *str)
1200{
1201 disable_apic = 1;
1202 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1203 return 0;
1204}
1205early_param("disableapic", setup_disableapic);
1206
1207/* same as disableapic, for compatibility */
1208static __init int setup_nolapic(char *str)
1209{
1210 return setup_disableapic(str);
1211}
1212early_param("nolapic", setup_nolapic);
1213
1214static int __init parse_lapic_timer_c2_ok(char *arg)
1215{
1216 local_apic_timer_c2_ok = 1;
1217 return 0;
1218}
1219early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1220
1221static __init int setup_noapictimer(char *str)
1222{
1223 if (str[0] != ' ' && str[0] != 0)
1224 return 0;
1225 disable_apic_timer = 1;
1226 return 1;
1227}
1228
1229static __init int setup_apicmaintimer(char *str)
1230{
1231 apic_runs_main_timer = 1;
1232 nohpet = 1;
1233 return 1;
1234}
1235__setup("apicmaintimer", setup_apicmaintimer);
1236
1237static __init int setup_noapicmaintimer(char *str)
1238{
1239 apic_runs_main_timer = -1;
1240 return 1;
1241}
1242__setup("noapicmaintimer", setup_noapicmaintimer);
1243
1244static __init int setup_apicpmtimer(char *s)
1245{
1246 apic_calibrate_pmtmr = 1;
1247 notsc_setup(NULL);
1248 return setup_apicmaintimer(NULL);
1249}
1250__setup("apicpmtimer", setup_apicpmtimer);
1251
1252__setup("noapictimer", setup_noapictimer);
1253
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 000000000000..32f2365c26ed
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2403 @@
1/* -*- linux-c -*-
2 * APM BIOS driver for Linux
3 * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
4 *
5 * Initial development of this driver was funded by NEC Australia P/L
6 * and NEC Corporation
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * October 1995, Rik Faith (faith@cs.unc.edu):
19 * Minor enhancements and updates (to the patch set) for 1.3.x
20 * Documentation
21 * January 1996, Rik Faith (faith@cs.unc.edu):
22 * Make /proc/apm easy to format (bump driver version)
23 * March 1996, Rik Faith (faith@cs.unc.edu):
24 * Prohibit APM BIOS calls unless apm_enabled.
25 * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
26 * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
27 * Version 1.0 and 1.1
28 * May 1996, Version 1.2
29 * Feb 1998, Version 1.3
30 * Feb 1998, Version 1.4
31 * Aug 1998, Version 1.5
32 * Sep 1998, Version 1.6
33 * Nov 1998, Version 1.7
34 * Jan 1999, Version 1.8
35 * Jan 1999, Version 1.9
36 * Oct 1999, Version 1.10
37 * Nov 1999, Version 1.11
38 * Jan 2000, Version 1.12
39 * Feb 2000, Version 1.13
40 * Nov 2000, Version 1.14
41 * Oct 2001, Version 1.15
42 * Jan 2002, Version 1.16
43 * Oct 2002, Version 1.16ac
44 *
45 * History:
46 * 0.6b: first version in official kernel, Linux 1.3.46
47 * 0.7: changed /proc/apm format, Linux 1.3.58
48 * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
49 * 0.9: only call bios if bios is present, Linux 1.3.72
50 * 1.0: use fixed device number, consolidate /proc/apm into this file,
51 * Linux 1.3.85
52 * 1.1: support user-space standby and suspend, power off after system
53 * halted, Linux 1.3.98
54 * 1.2: When resetting RTC after resume, take care so that the time
55 * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
56 * <jtoth@princeton.edu>); improve interaction between
57 * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
58 * 1.2a: Simple change to stop mysterious bug reports with SMP; also added
59 * levels to the printk calls. APM is not defined for SMP machines.
60 * The new replacement for it is, but Linux doesn't yet support this.
61 * Alan Cox Linux 2.1.55
62 * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
63 * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
64 * Dean Gaudet <dgaudet@arctic.org>.
65 * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call).
68 * Stephen Rothwell
69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de>
73 * Stephen Rothwell
74 * 1.7: Modify driver's cached copy of the disabled/disengaged flags
75 * to reflect current state of APM BIOS.
76 * Chris Rankin <rankinc@bellsouth.net>
77 * Reset interrupt 0 timer to 100Hz after suspend
78 * Chad Miller <cmiller@surfsouth.com>
79 * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
80 * Richard Gooch <rgooch@atnf.csiro.au>
81 * Allow boot time disabling of APM
82 * Make boot messages far less verbose by default
83 * Make asm safer
84 * Stephen Rothwell
85 * 1.8: Add CONFIG_APM_RTC_IS_GMT
86 * Richard Gooch <rgooch@atnf.csiro.au>
87 * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
88 * remove dependency on CONFIG_PROC_FS
89 * Stephen Rothwell
90 * 1.9: Fix small typo. <laslo@wodip.opole.pl>
91 * Try to cope with BIOS's that need to have all display
92 * devices blanked and not just the first one.
93 * Ross Paterson <ross@soi.city.ac.uk>
94 * Fix segment limit setting; it has always been wrong as
95 * the segments needed to have byte granularity.
96 * Mark a few things __init.
97 * Add hack to allow power off of SMP systems by popular request.
98 * Use CONFIG_SMP instead of __SMP__
99 * Ignore BOUNCES for three seconds.
100 * Stephen Rothwell
101 * 1.10: Fix for Thinkpad return code.
102 * Merge 2.2 and 2.3 drivers.
103 * Remove APM dependencies in arch/i386/kernel/process.c
104 * Remove APM dependencies in drivers/char/sysrq.c
105 * Reset time across standby.
106 * Allow more initialisation on SMP.
107 * Remove CONFIG_APM_POWER_OFF and make it boot time
108 * configurable (default on).
109 * Make debug only a boot time parameter (remove APM_DEBUG).
110 * Try to blank all devices on any error.
111 * 1.11: Remove APM dependencies in drivers/char/console.c
112 * Check nr_running to detect if we are idle (from
113 * Borislav Deianov <borislav@lix.polytechnique.fr>)
114 * Fix for bioses that don't zero the top part of the
115 * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
116 * (reported by Panos Katsaloulis <teras@writeme.com>).
117 * Real mode power off patch (Walter Hofmann
118 * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
119 * 1.12: Remove CONFIG_SMP as the compiler will optimize
120 * the code away anyway (smp_num_cpus == 1 in UP)
121 * noted by Artur Skawina <skawina@geocities.com>.
122 * Make power off under SMP work again.
123 * Fix thinko with initial engaging of BIOS.
124 * Make sure power off only happens on CPU 0
125 * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
126 * Do error notification to user mode if BIOS calls fail.
127 * Move entrypoint offset fix to ...boot/setup.S
128 * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
129 * Remove smp-power-off. SMP users must now specify
130 * "apm=power-off" on the kernel command line. Suggested
131 * by Jim Avera <jima@hal.com>, modified by Alan Cox
132 * <alan@lxorguk.ukuu.org.uk>.
133 * Register the /proc/apm entry even on SMP so that
134 * scripts that check for it before doing power off
135 * work (Jim Avera <jima@hal.com>).
136 * 1.13: Changes for new pm_ interfaces (Andy Henroid
137 * <andy_henroid@yahoo.com>).
138 * Modularize the code.
139 * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
140 * is now the way life works).
141 * Fix thinko in suspend() (wrong return).
142 * Notify drivers on critical suspend.
143 * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
144 * modified by sfr).
145 * Disable interrupts while we are suspended (Andy Henroid
146 * <andy_henroid@yahoo.com> fixed by sfr).
147 * Make power off work on SMP again (Tony Hoyle
148 * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
149 * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
150 * interval is now configurable.
151 * 1.14: Make connection version persist across module unload/load.
152 * Enable and engage power management earlier.
153 * Disengage power management on module unload.
154 * Changed to use the sysrq-register hack for registering the
155 * power off function called by magic sysrq based upon discussions
156 * in irc://irc.openprojects.net/#kernelnewbies
157 * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
158 * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
159 * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
160 * Work around byte swap bug in one of the Vaio's BIOS's
161 * (Marc Boucher <marc@mbsi.ca>).
162 * Exposed the disable flag to dmi so that we can handle known
163 * broken APM (Alan Cox <alan@redhat.com>).
164 * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
165 * calling it - instead idle. (Alan Cox <alan@redhat.com>)
166 * If an APM idle fails, log it and idle sensibly
167 * 1.15: Don't queue events to clients who open the device O_WRONLY.
168 * Don't expect replies from clients who open the device O_RDONLY.
169 * (Idea from Thomas Hood)
170 * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
171 * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
172 * Notify listeners of standby or suspend events before notifying
173 * drivers. Return EBUSY to ioctl() if suspend is rejected.
174 * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
175 * Ignore first resume after we generate our own resume event
176 * after a suspend (Thomas Hood)
177 * Daemonize now gets rid of our controlling terminal (sfr).
178 * CONFIG_APM_CPU_IDLE now just affects the default value of
179 * idle_threshold (sfr).
180 * Change name of kernel apm daemon (as it no longer idles) (sfr).
181 * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
182 * make _all_ APM calls on CPU#0. Fix unsafe sign bug.
183 * TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
184 *
185 * APM 1.1 Reference:
186 *
187 * Intel Corporation, Microsoft Corporation. Advanced Power Management
188 * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
194 * available from Microsoft by calling 206.882.8080.]
195 *
196 * APM 1.2 Reference:
197 * Intel Corporation, Microsoft Corporation. Advanced Power Management
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 *
200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */
203
204#include <linux/module.h>
205
206#include <linux/poll.h>
207#include <linux/types.h>
208#include <linux/stddef.h>
209#include <linux/timer.h>
210#include <linux/fcntl.h>
211#include <linux/slab.h>
212#include <linux/stat.h>
213#include <linux/proc_fs.h>
214#include <linux/seq_file.h>
215#include <linux/miscdevice.h>
216#include <linux/apm_bios.h>
217#include <linux/init.h>
218#include <linux/time.h>
219#include <linux/sched.h>
220#include <linux/pm.h>
221#include <linux/pm_legacy.h>
222#include <linux/capability.h>
223#include <linux/device.h>
224#include <linux/kernel.h>
225#include <linux/freezer.h>
226#include <linux/smp.h>
227#include <linux/dmi.h>
228#include <linux/suspend.h>
229#include <linux/kthread.h>
230
231#include <asm/system.h>
232#include <asm/uaccess.h>
233#include <asm/desc.h>
234#include <asm/i8253.h>
235#include <asm/paravirt.h>
236#include <asm/reboot.h>
237
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int);
242#endif
243
244/*
245 * The apm_bios device is one of the misc char devices.
246 * This is its minor number.
247 */
248#define APM_MINOR_DEV 134
249
250/*
251 * See Documentation/Config.help for the configuration options.
252 *
253 * Various options can be changed at boot time as follows:
254 * (We allow underscores for compatibility with the modules code)
255 * apm=on/off enable/disable APM
256 * [no-]allow[-_]ints allow interrupts during BIOS calls
257 * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
258 * [no-]realmode[-_]power[-_]off switch to real mode before
259 * powering off
260 * [no-]debug log some debugging messages
261 * [no-]power[-_]off power off on shutdown
262 * [no-]smp Use apm even on an SMP box
263 * bounce[-_]interval=<n> number of ticks to ignore suspend
264 * bounces
265 * idle[-_]threshold=<n> System idle percentage above which to
266 * make APM BIOS idle calls. Set it to
267 * 100 to disable.
268 * idle[-_]period=<n> Period (in 1/100s of a second) over
269 * which the idle percentage is
270 * calculated.
271 */
272
273/* KNOWN PROBLEM MACHINES:
274 *
275 * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
276 * [Confirmed by TI representative]
277 * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
278 * [Confirmed by BIOS disassembly]
279 * [This may work now ...]
280 * P: Toshiba 1950S: battery life information only gets updated after resume
281 * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
282 * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
283 * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
284 * Neale Banks <neale@lowendale.com.au> December 2000
285 *
286 * Legend: U = unusable with APM patches
287 * P = partially usable with APM patches
288 */
289
290/*
291 * Define as 1 to make the driver always call the APM BIOS busy
292 * routine even if the clock was not reported as slowed by the
293 * idle routine. Otherwise, define as 0.
294 */
295#define ALWAYS_CALL_BUSY 1
296
297/*
298 * Define to make the APM BIOS calls zero all data segment registers (so
299 * that an incorrect BIOS implementation will cause a kernel panic if it
300 * tries to write to arbitrary memory).
301 */
302#define APM_ZERO_SEGS
303
304#include "apm.h"
305
306/*
307 * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
309 * This was patched by Chad Miller <cmiller@surfsouth.com>, original code by
309 * David Chen <chen@ctpa04.mit.edu>
310 */
311#undef INIT_TIMER_AFTER_SUSPEND
312
313#ifdef INIT_TIMER_AFTER_SUSPEND
314#include <linux/timex.h>
315#include <asm/io.h>
316#include <linux/delay.h>
317#endif
318
319/*
320 * Need to poll the APM BIOS every second
321 */
322#define APM_CHECK_TIMEOUT (HZ)
323
324/*
325 * Ignore suspend events for this amount of time after a resume
326 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328
329/*
330 * Maximum number of events stored
331 */
332#define APM_MAX_EVENTS 20
333
334/*
335 * The per-file APM data
336 */
337struct apm_user {
338 int magic;
339 struct apm_user * next;
340 unsigned int suser: 1;
341 unsigned int writer: 1;
342 unsigned int reader: 1;
343 unsigned int suspend_wait: 1;
344 int suspend_result;
345 int suspends_pending;
346 int standbys_pending;
347 int suspends_read;
348 int standbys_read;
349 int event_head;
350 int event_tail;
351 apm_event_t events[APM_MAX_EVENTS];
352};
353
354/*
355 * The magic number in apm_user
356 */
357#define APM_BIOS_MAGIC 0x4101
358
359/*
360 * idle percentage above which bios idle calls are done
361 */
362#ifdef CONFIG_APM_CPU_IDLE
363#define DEFAULT_IDLE_THRESHOLD 95
364#else
365#define DEFAULT_IDLE_THRESHOLD 100
366#endif
367#define DEFAULT_IDLE_PERIOD (100 / 3)
368
369/*
370 * Local variables
371 */
372static struct {
373 unsigned long offset;
374 unsigned short segment;
375} apm_bios_entry;
376static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle;
380static int suspends_pending;
381static int standbys_pending;
382static int ignore_sys_suspend;
383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385
386static int debug __read_mostly;
387static int smp __read_mostly;
388static int apm_disabled = -1;
389#ifdef CONFIG_SMP
390static int power_off;
391#else
392static int power_off = 1;
393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1;
401#else
402static int allow_ints;
403#endif
404static int broken_psr;
405
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list;
409static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 };
411
412static const char driver_version[] = "1.16ac"; /* no spaces */
413
414static struct task_struct *kapmd_task;
415
416/*
417 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events
419 */
420static const char * const apm_event_name[] = {
421 "system standby",
422 "system suspend",
423 "normal resume",
424 "critical resume",
425 "low battery",
426 "power status change",
427 "update time",
428 "critical suspend",
429 "user standby",
430 "user suspend",
431 "system standby resume",
432 "capabilities change"
433};
434#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
435
436typedef struct lookup_t {
437 int key;
438 char * msg;
439} lookup_t;
440
441/*
442 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set.
444 */
445
446static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" },
449 { APM_CONNECTED, "Real mode interface already connected" },
450 { APM_NOT_CONNECTED, "Interface not connected" },
451 { APM_16_CONNECTED, "16 bit interface already connected" },
452/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
453 { APM_32_CONNECTED, "32 bit interface already connected" },
454 { APM_32_UNSUPPORTED, "32 bit interface not supported" },
455 { APM_BAD_DEVICE, "Unrecognized device ID" },
456 { APM_BAD_PARAM, "Parameter out of range" },
457 { APM_NOT_ENGAGED, "Interface not engaged" },
458 { APM_BAD_FUNCTION, "Function not supported" },
459 { APM_RESUME_DISABLED, "Resume timer disabled" },
460 { APM_BAD_STATE, "Unable to enter requested state" },
461/* N/A { APM_NO_EVENTS, "No events pending" }, */
462 { APM_NO_ERROR, "BIOS did not set a return code" },
463 { APM_NOT_PRESENT, "No APM present" }
464};
465#define ERROR_COUNT ARRAY_SIZE(error_table)
466
467/**
468 * apm_error - display an APM error
469 * @str: information string
470 * @err: APM BIOS return code
471 *
472 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error.
474 */
475
476static void apm_error(char *str, int err)
477{
478 int i;
479
480 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break;
482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err);
487}
488
489/*
490 * Lock APM functionality to physical CPU 0
491 */
492
493#ifdef CONFIG_SMP
494
495static cpumask_t apm_save_cpus(void)
496{
497 cpumask_t x = current->cpus_allowed;
498 /* Some bioses don't like being called from CPU != 0 */
499 set_cpus_allowed(current, cpumask_of_cpu(0));
500 BUG_ON(smp_processor_id() != 0);
501 return x;
502}
503
504static inline void apm_restore_cpus(cpumask_t mask)
505{
506 set_cpus_allowed(current, mask);
507}
508
509#else
510
511/*
512 * No CPU lockdown needed on a uniprocessor
513 */
514
515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x)
517
518#endif
519
520/*
521 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
522 * apm_info.allow_ints, we are being really paranoid here! Not only
523 * are interrupts disabled, but all the segment registers (except SS)
524 * are saved and zeroed; this means that if the BIOS tries to reference
525 * any data without explicitly loading the segment registers, the kernel
526 * will fault immediately rather than have some unforeseen circumstances
527 * for the rest of the kernel. And it will be very obvious! :-) Doing
528 * this depends on CS referring to the same physical memory as DS so that
529 * DS can be zeroed before the call. Unfortunately, we can't do anything
530 * about the stack segment/pointer. Also, we tell the compiler that
531 * everything could change.
532 *
533 * Also, we KNOW that for the non error case of apm_bios_call, there
534 * is no useful data returned in the low order 8 bits of eax.
535 */
536
537static inline unsigned long __apm_irq_save(void)
538{
539 unsigned long flags;
540 local_save_flags(flags);
541 if (apm_info.allow_ints) {
542 if (irqs_disabled_flags(flags))
543 local_irq_enable();
544 } else
545 local_irq_disable();
546
547 return flags;
548}
549
550#define apm_irq_save(flags) \
551 do { flags = __apm_irq_save(); } while (0)
552
553static inline void apm_irq_restore(unsigned long flags)
554{
555 if (irqs_disabled_flags(flags))
556 local_irq_disable();
557 else if (irqs_disabled())
558 local_irq_enable();
559}
560
561#ifdef APM_ZERO_SEGS
562# define APM_DECL_SEGS \
563 unsigned int saved_fs; unsigned int saved_gs;
564# define APM_DO_SAVE_SEGS \
565 savesegment(fs, saved_fs); savesegment(gs, saved_gs)
566# define APM_DO_RESTORE_SEGS \
567 loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
568#else
569# define APM_DECL_SEGS
570# define APM_DO_SAVE_SEGS
571# define APM_DO_RESTORE_SEGS
572#endif
573
574/**
575 * apm_bios_call - Make an APM BIOS 32bit call
576 * @func: APM function to execute
577 * @ebx_in: EBX register for call entry
578 * @ecx_in: ECX register for call entry
579 * @eax: EAX register return
580 * @ebx: EBX register return
581 * @ecx: ECX register return
582 * @edx: EDX register return
583 * @esi: ESI register return
584 *
585 * Make an APM call using the 32bit protected mode interface. The
586 * caller is responsible for knowing if APM BIOS is configured and
587 * enabled. This call can disable interrupts for a long period of
588 * time on some laptops. The return value is in AH and the carry
589 * flag is loaded into AL. If there is an error, then the error
590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero.
592 */
593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{
597 APM_DECL_SEGS
598 unsigned long flags;
599 cpumask_t cpus;
600 int cpu;
601 struct desc_struct save_desc_40;
602 struct desc_struct *gdt;
603
604 cpus = apm_save_cpus();
605
606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8];
609 gdt[0x40 / 8] = bad_bios_desc;
610
611 apm_irq_save(flags);
612 APM_DO_SAVE_SEGS;
613 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
614 APM_DO_RESTORE_SEGS;
615 apm_irq_restore(flags);
616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu();
618 apm_restore_cpus(cpus);
619
620 return *eax & 0xff;
621}
622
623/**
624 * apm_bios_call_simple - make a simple APM BIOS 32bit call
625 * @func: APM function to invoke
626 * @ebx_in: EBX register value for BIOS call
627 * @ecx_in: ECX register value for BIOS call
628 * @eax: EAX register on return from the BIOS call
629 *
630 * Make a BIOS call that returns one value only, or just status.
631 * If there is an error, then the error code is returned in AH
632 * (bits 8-15 of eax) and this function returns non-zero. This is
633 * used for simpler BIOS operations. This call may hold interrupts
634 * off for a long time on some laptops.
635 */
636
637static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
638{
639 u8 error;
640 APM_DECL_SEGS
641 unsigned long flags;
642 cpumask_t cpus;
643 int cpu;
644 struct desc_struct save_desc_40;
645 struct desc_struct *gdt;
646
647 cpus = apm_save_cpus();
648
649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8];
652 gdt[0x40 / 8] = bad_bios_desc;
653
654 apm_irq_save(flags);
655 APM_DO_SAVE_SEGS;
656 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
657 APM_DO_RESTORE_SEGS;
658 apm_irq_restore(flags);
659 gdt[0x40 / 8] = save_desc_40;
660 put_cpu();
661 apm_restore_cpus(cpus);
662 return error;
663}
664
665/**
666 * apm_driver_version - APM driver version
667 * @val: loaded with the APM version on return
668 *
669 * Retrieve the APM version supported by the BIOS. This is only
670 * supported for APM 1.1 or higher. An error indicates APM 1.0 is
671 * probably present.
672 *
673 * On entry val should point to a value indicating the APM driver
674 * version with the high byte being the major and the low byte the
675 * minor number, both in BCD.
676 *
677 * On return it will hold the BIOS revision supported in the
678 * same format.
679 */
680
681static int apm_driver_version(u_short *val)
682{
683 u32 eax;
684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff;
687 *val = eax;
688 return APM_SUCCESS;
689}
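
A hedged illustration of the BCD encoding described in the comment above (standalone, not driver code): the high byte carries the major number and the low byte the minor number, each as two BCD digits, so 0x0102 decodes to 1.2.

#include <stdio.h>

/* Decode an APM version word whose bytes hold BCD major/minor numbers.
 * 0x0102 is used as an assumed example and decodes to "1.2". */
static void print_apm_version(unsigned short val)
{
	unsigned int major = ((val >> 12) & 0xf) * 10 + ((val >> 8) & 0xf);
	unsigned int minor = ((val >> 4) & 0xf) * 10 + (val & 0xf);

	printf("%u.%u\n", major, minor);
}

int main(void)
{
	print_apm_version(0x0102);	/* prints "1.2" */
	return 0;
}
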
690
691/**
692 * apm_get_event - get an APM event from the BIOS
693 * @event: pointer to the event
694 * @info: point to the event information
695 *
696 * The APM BIOS provides a polled interface for event
697 * reporting. The BIOS expects to be polled at least every second
698 * when events are pending. When a message is found the caller should
699 * poll until no more messages are present. However, this causes
700 * problems on some laptops where a suspend event notification is
701 * not cleared until it is acknowledged.
702 *
703 * Additional information is returned in the info pointer, provided
704 * that APM 1.2 is in use. If no messages are pending the value 0x80
705 * is returned (No power management events pending).
706 */
707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{
710 u32 eax;
711 u32 ebx;
712 u32 ecx;
713 u32 dummy;
714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy))
717 return (eax >> 8) & 0xff;
718 *event = ebx;
719 if (apm_info.connection_version < 0x0102)
720 *info = ~0; /* indicate info not valid */
721 else
722 *info = ecx;
723 return APM_SUCCESS;
724}
725
726/**
727 * set_power_state - set the power management state
728 * @what: which items to transition
729 * @state: state to transition to
730 *
731 * Request an APM change of state for one or more system devices. The
732 * processor state must be transitioned last of all. what holds the
733 * class of device in the upper byte and the device number (0xFF for
734 * all) for the object to be transitioned.
735 *
736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change.
738 */
739
740static int set_power_state(u_short what, u_short state)
741{
742 u32 eax;
743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff;
746 return APM_SUCCESS;
747}
748
749/**
750 * set_system_power_state - set system wide power state
751 * @state: which state to enter
752 *
753 * Transition the entire system into a new APM power state.
754 */
755
756static int set_system_power_state(u_short state)
757{
758 return set_power_state(APM_DEVICE_ALL, state);
759}
760
761/**
762 * apm_do_idle - perform power saving
763 *
764 * This function notifies the BIOS that the processor is (in the view
765 * of the OS) idle. It returns -1 in the event that the BIOS refuses
766 * to handle the idle request. On a success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise.
768 */
769
770static int apm_do_idle(void)
771{
772 u32 eax;
773 u8 ret = 0;
774 int idled = 0;
775 int polling;
776
777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) {
779 current_thread_info()->status &= ~TS_POLLING;
780 /*
781 * TS_POLLING-cleared state must be visible before we
782 * test NEED_RESCHED:
783 */
784 smp_mb();
785 }
786 if (!need_resched()) {
787 idled = 1;
788 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
789 }
790 if (polling)
791 current_thread_info()->status |= TS_POLLING;
792
793 if (!idled)
794 return 0;
795
796 if (ret) {
797 static unsigned long t;
798
799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times.
801 */
802 if (++t < 5)
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff);
806 t = jiffies;
807 }
808 return -1;
809 }
810 clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
811 return clock_slowed;
812}
813
814/**
815 * apm_do_busy - inform the BIOS the CPU is busy
816 *
817 * Request that the BIOS bring the CPU back to full performance.
818 */
819
820static void apm_do_busy(void)
821{
822 u32 dummy;
823
824 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0;
827 }
828}
829
830/*
831 * If no process has really been interested in
832 * the CPU for some time, we want to call BIOS
833 * power management - we probably want
834 * to conserve power.
835 */
836#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16
838
839static void (*original_pm_idle)(void) __read_mostly;
840
841/**
842 * apm_cpu_idle - cpu idling for APM capable Linux
843 *
844 * This is the idling function the kernel executes when APM is available. It
845 * tries to do BIOS power management based on the average system idle time.
846 * Furthermore it calls the system default idle routine.
847 */
848
849static void apm_cpu_idle(void)
850{
851 static int use_apm_idle; /* = 0 */
852 static unsigned int last_jiffies; /* = 0 */
853 static unsigned int last_stime; /* = 0 */
854
855 int apm_idle_done = 0;
856 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
857 unsigned int bucket;
858
859recalc:
860 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
861 use_apm_idle = 0;
862 last_jiffies = jiffies;
863 last_stime = current->stime;
864 } else if (jiffies_since_last_check > idle_period) {
865 unsigned int idle_percentage;
866
867 idle_percentage = current->stime - last_stime;
868 idle_percentage *= 100;
869 idle_percentage /= jiffies_since_last_check;
870 use_apm_idle = (idle_percentage > idle_threshold);
871 if (apm_info.forbid_idle)
872 use_apm_idle = 0;
873 last_jiffies = jiffies;
874 last_stime = current->stime;
875 }
876
877 bucket = IDLE_LEAKY_MAX;
878
879 while (!need_resched()) {
880 if (use_apm_idle) {
881 unsigned int t;
882
883 t = jiffies;
884 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1;
886 if (t != jiffies) {
887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX;
889 continue;
890 }
891 } else if (bucket) {
892 bucket--;
893 continue;
894 }
895 break;
896 case 1: apm_idle_done = 1;
897 break;
898 default: /* BIOS refused */
899 break;
900 }
901 }
902 if (original_pm_idle)
903 original_pm_idle();
904 else
905 default_idle();
906 jiffies_since_last_check = jiffies - last_jiffies;
907 if (jiffies_since_last_check > idle_period)
908 goto recalc;
909 }
910
911 if (apm_idle_done)
912 apm_do_busy();
913}
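
A worked example of the recalculation above, with assumed numbers rather than real measurements: 39 ticks of idle stime over 40 elapsed jiffies gives 97%, which exceeds the default CONFIG_APM_CPU_IDLE threshold of 95, so BIOS idle calls would be enabled for the next period.

#include <stdio.h>

/* Idle-percentage test from apm_cpu_idle(), with assumed sample values:
 * 39 ticks of idle stime accumulated over 40 elapsed jiffies, checked
 * against the default threshold of 95. */
int main(void)
{
	unsigned int jiffies_since_last_check = 40;	/* assumed */
	unsigned int stime_delta = 39;			/* assumed */
	unsigned int idle_threshold = 95;		/* DEFAULT_IDLE_THRESHOLD */
	unsigned int idle_percentage = stime_delta * 100 / jiffies_since_last_check;

	printf("idle = %u%%, use_apm_idle = %d\n",
	       idle_percentage, idle_percentage > idle_threshold);
	/* prints "idle = 97%, use_apm_idle = 1" */
	return 0;
}
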
914
915/**
916 * apm_power_off - ask the BIOS to power off
917 *
918 * Handle the power off sequence. This is the one piece of code we
919 * will execute even on SMP machines. In order to deal with BIOS
920 * bugs we support real mode APM BIOS power off calls. We also make
921 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu.
923 */
924
925static void apm_power_off(void)
926{
927 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
931 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
932 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
933 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
934 0xcd, 0x15 /* int $0x15 */
935 };
936
937 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off)
939 {
940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945}
946
947#ifdef CONFIG_APM_DO_ENABLE
948
949/**
950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no
952 *
953 * Enable or disable the APM BIOS power services.
954 */
955
956static int apm_enable_power_management(int enable)
957{
958 u32 eax;
959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax))
964 return (eax >> 8) & 0xff;
965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
967 else
968 apm_info.bios.flags |= APM_BIOS_DISABLED;
969 return APM_SUCCESS;
970}
971#endif
972
973/**
974 * apm_get_power_status - get current power state
975 * @status: returned status
976 * @bat: battery info
977 * @life: estimated life
978 *
979 * Obtain the current power status from the APM BIOS. We return a
980 * status which gives the rough battery status, and current power
981 * source. The bat value returned gives an estimate as a percentage
982 * of life and a status value for the battery. The estimated life,
983 * if reported, is a lifetime in seconds/minutes at current power
984 * consumption.
985 */
986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{
989 u32 eax;
990 u32 ebx;
991 u32 ecx;
992 u32 edx;
993 u32 dummy;
994
995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff;
1000 *status = ebx;
1001 *bat = ecx;
1002 if (apm_info.get_power_status_swabinminutes) {
1003 *life = swab16((u16)edx);
1004 *life |= 0x8000;
1005 } else
1006 *life = edx;
1007 return APM_SUCCESS;
1008}
1009
1010#if 0
1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat)
1013{
1014 u32 eax;
1015 u32 ebx;
1016 u32 ecx;
1017 u32 edx;
1018 u32 esi;
1019
1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */
1022 if (which != 1)
1023 return APM_BAD_DEVICE;
1024 *nbat = 1;
1025 return apm_get_power_status(status, bat, life);
1026 }
1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff;
1031 *status = ebx;
1032 *bat = ecx;
1033 *life = edx;
1034 *nbat = esi;
1035 return APM_SUCCESS;
1036}
1037#endif
1038
1039/**
1040 * apm_engage_power_management - enable PM on a device
1041 * @device: identity of device
1042 * @enable: on/off
1043 *
1044 * Activate or deactivate power management on either a specific device
1045 * or the entire system (%APM_DEVICE_ALL).
1046 */
1047
1048static int apm_engage_power_management(u_short device, int enable)
1049{
1050 u32 eax;
1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1054 return APM_DISABLED;
1055 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax))
1056 return (eax >> 8) & 0xff;
1057 if (device == APM_DEVICE_ALL) {
1058 if (enable)
1059 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
1060 else
1061 apm_info.bios.flags |= APM_BIOS_DISENGAGED;
1062 }
1063 return APM_SUCCESS;
1064}
1065
1066#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1067
1068/**
1069 * apm_console_blank - blank the display
1070 * @blank: on/off
1071 *
1072 * Attempt to blank the console, firstly by blanking just video device
1073 * zero, and if that fails (some BIOSes don't support it) then it blanks
1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us.
1076 */
1077
1078static int apm_console_blank(int blank)
1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */
1081 int i;
1082 u_short state;
1083 static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
1084
1085 state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
1086
1087 for (i = 0; i < ARRAY_SIZE(dev); i++) {
1088 error = set_power_state(dev[i], state);
1089
1090 if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
1091 return 1;
1092
1093 if (error == APM_NOT_ENGAGED)
1094 break;
1095 }
1096
1097 if (error == APM_NOT_ENGAGED) {
1098 static int tried;
1099 int eng_error;
1100 if (tried++ == 0) {
1101 eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1102 if (eng_error) {
1103 apm_error("set display", error);
1104 apm_error("engage interface", eng_error);
1105 return 0;
1106 } else
1107 return apm_console_blank(blank);
1108 }
1109 }
1110 apm_error("set display", error);
1111 return 0;
1112}
1113#endif
1114
1115static int queue_empty(struct apm_user *as)
1116{
1117 return as->event_head == as->event_tail;
1118}
1119
1120static apm_event_t get_queued_event(struct apm_user *as)
1121{
1122 if (++as->event_tail >= APM_MAX_EVENTS)
1123 as->event_tail = 0;
1124 return as->events[as->event_tail];
1125}
1126
1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{
1129 struct apm_user * as;
1130
1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL)
1133 goto out;
1134 for (as = user_list; as != NULL; as = as->next) {
1135 if ((as == sender) || (!as->reader))
1136 continue;
1137 if (++as->event_head >= APM_MAX_EVENTS)
1138 as->event_head = 0;
1139
1140 if (as->event_head == as->event_tail) {
1141 static int notified;
1142
1143 if (notified++ == 0)
1144 printk(KERN_ERR "apm: an event queue overflowed\n");
1145 if (++as->event_tail >= APM_MAX_EVENTS)
1146 as->event_tail = 0;
1147 }
1148 as->events[as->event_head] = event;
1149 if ((!as->suser) || (!as->writer))
1150 continue;
1151 switch (event) {
1152 case APM_SYS_SUSPEND:
1153 case APM_USER_SUSPEND:
1154 as->suspends_pending++;
1155 suspends_pending++;
1156 break;
1157
1158 case APM_SYS_STANDBY:
1159 case APM_USER_STANDBY:
1160 as->standbys_pending++;
1161 standbys_pending++;
1162 break;
1163 }
1164 }
1165 wake_up_interruptible(&apm_waitqueue);
1166out:
1167 spin_unlock(&user_list_lock);
1168}
1169
1170static void reinit_timer(void)
1171{
1172#ifdef INIT_TIMER_AFTER_SUSPEND
1173 unsigned long flags;
1174
1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif
1185}
1186
1187static int suspend(int vetoable)
1188{
1189 int err;
1190 struct apm_user *as;
1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
1193 /* Vetoed */
1194 if (vetoable) {
1195 if (apm_info.connection_version > 0x100)
1196 set_system_power_state(APM_STATE_REJECT);
1197 err = -EBUSY;
1198 ignore_sys_suspend = 0;
1199 printk(KERN_WARNING "apm: suspend was vetoed.\n");
1200 goto out;
1201 }
1202 printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
1203 }
1204
1205 device_suspend(PMSG_SUSPEND);
1206 local_irq_disable();
1207 device_power_down(PMSG_SUSPEND);
1208
1209 local_irq_enable();
1210
1211 save_processor_state();
1212 err = set_system_power_state(APM_STATE_SUSPEND);
1213 ignore_normal_resume = 1;
1214 restore_processor_state();
1215
1216 local_irq_disable();
1217 reinit_timer();
1218
1219 if (err == APM_NO_ERROR)
1220 err = APM_SUCCESS;
1221 if (err != APM_SUCCESS)
1222 apm_error("suspend", err);
1223 err = (err == APM_SUCCESS) ? 0 : -EIO;
1224 device_power_up();
1225 local_irq_enable();
1226 device_resume();
1227 pm_send_all(PM_RESUME, (void *)0);
1228 queue_event(APM_NORMAL_RESUME, NULL);
1229 out:
1230 spin_lock(&user_list_lock);
1231 for (as = user_list; as != NULL; as = as->next) {
1232 as->suspend_wait = 0;
1233 as->suspend_result = err;
1234 }
1235 spin_unlock(&user_list_lock);
1236 wake_up_interruptible(&apm_suspend_waitqueue);
1237 return err;
1238}
1239
1240static void standby(void)
1241{
1242 int err;
1243
1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND);
1246 local_irq_enable();
1247
1248 err = set_system_power_state(APM_STATE_STANDBY);
1249 if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
1250 apm_error("standby", err);
1251
1252 local_irq_disable();
1253 device_power_up();
1254 local_irq_enable();
1255}
1256
1257static apm_event_t get_event(void)
1258{
1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info;
1262
1263 static int notified;
1264
1265 /* we don't use the eventinfo */
1266 error = apm_get_event(&event, &info);
1267 if (error == APM_SUCCESS)
1268 return event;
1269
1270 if ((error != APM_NO_EVENTS) && (notified++ == 0))
1271 apm_error("get_event", error);
1272
1273 return 0;
1274}
1275
1276static void check_events(void)
1277{
1278 apm_event_t event;
1279 static unsigned long last_resume;
1280 static int ignore_bounce;
1281
1282 while ((event = get_event()) != 0) {
1283 if (debug) {
1284 if (event <= NR_APM_EVENT_NAME)
1285 printk(KERN_DEBUG "apm: received %s notify\n",
1286 apm_event_name[event - 1]);
1287 else
1288 printk(KERN_DEBUG "apm: received unknown "
1289 "event 0x%02x\n", event);
1290 }
1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval))
1293 ignore_bounce = 0;
1294
1295 switch (event) {
1296 case APM_SYS_STANDBY:
1297 case APM_USER_STANDBY:
1298 queue_event(event, NULL);
1299 if (standbys_pending <= 0)
1300 standby();
1301 break;
1302
1303 case APM_USER_SUSPEND:
1304#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
1305 if (apm_info.connection_version > 0x100)
1306 set_system_power_state(APM_STATE_REJECT);
1307 break;
1308#endif
1309 case APM_SYS_SUSPEND:
1310 if (ignore_bounce) {
1311 if (apm_info.connection_version > 0x100)
1312 set_system_power_state(APM_STATE_REJECT);
1313 break;
1314 }
1315 /*
1316 * If we are already processing a SUSPEND,
1317 * then further SUSPEND events from the BIOS
1318 * will be ignored. We also return here to
1319 * cope with the fact that the Thinkpads keep
1320 * sending a SUSPEND event until something else
1321 * happens!
1322 */
1323 if (ignore_sys_suspend)
1324 return;
1325 ignore_sys_suspend = 1;
1326 queue_event(event, NULL);
1327 if (suspends_pending <= 0)
1328 (void) suspend(1);
1329 break;
1330
1331 case APM_NORMAL_RESUME:
1332 case APM_CRITICAL_RESUME:
1333 case APM_STANDBY_RESUME:
1334 ignore_sys_suspend = 0;
1335 last_resume = jiffies;
1336 ignore_bounce = 1;
1337 if ((event != APM_NORMAL_RESUME)
1338 || (ignore_normal_resume == 0)) {
1339 device_resume();
1340 pm_send_all(PM_RESUME, (void *)0);
1341 queue_event(event, NULL);
1342 }
1343 ignore_normal_resume = 0;
1344 break;
1345
1346 case APM_CAPABILITY_CHANGE:
1347 case APM_LOW_BATTERY:
1348 case APM_POWER_STATUS_CHANGE:
1349 queue_event(event, NULL);
1350 /* If needed, notify drivers here */
1351 break;
1352
1353 case APM_UPDATE_TIME:
1354 break;
1355
1356 case APM_CRITICAL_SUSPEND:
1357 /*
1358 * We are not allowed to reject a critical suspend.
1359 */
1360 (void) suspend(0);
1361 break;
1362 }
1363 }
1364}
1365
1366static void apm_event_handler(void)
1367{
1368 static int pending_count = 4;
1369 int err;
1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) {
1374 pending_count = 4;
1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n");
1377 err = set_system_power_state(APM_STATE_BUSY);
1378 if (err)
1379 apm_error("busy", err);
1380 }
1381 } else
1382 pending_count = 4;
1383 check_events();
1384}
1385
1386/*
1387 * This is the APM thread main loop.
1388 */
1389
1390static void apm_mainloop(void)
1391{
1392 DECLARE_WAITQUEUE(wait, current);
1393
1394 add_wait_queue(&apm_waitqueue, &wait);
1395 set_current_state(TASK_INTERRUPTIBLE);
1396 for (;;) {
1397 schedule_timeout(APM_CHECK_TIMEOUT);
1398 if (kthread_should_stop())
1399 break;
1400 /*
1401 * Ok, check all events, check for idle (and mark us sleeping
1402 * so as not to count towards the load average)..
1403 */
1404 set_current_state(TASK_INTERRUPTIBLE);
1405 apm_event_handler();
1406 }
1407 remove_wait_queue(&apm_waitqueue, &wait);
1408}
1409
1410static int check_apm_user(struct apm_user *as, const char *func)
1411{
1412 if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
1413 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1414 return 1;
1415 }
1416 return 0;
1417}
1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{
1421 struct apm_user * as;
1422 int i;
1423 apm_event_t event;
1424
1425 as = fp->private_data;
1426 if (check_apm_user(as, "read"))
1427 return -EIO;
1428 if ((int)count < sizeof(apm_event_t))
1429 return -EINVAL;
1430 if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
1431 return -EAGAIN;
1432 wait_event_interruptible(apm_waitqueue, !queue_empty(as));
1433 i = count;
1434 while ((i >= sizeof(event)) && !queue_empty(as)) {
1435 event = get_queued_event(as);
1436 if (copy_to_user(buf, &event, sizeof(event))) {
1437 if (i < count)
1438 break;
1439 return -EFAULT;
1440 }
1441 switch (event) {
1442 case APM_SYS_SUSPEND:
1443 case APM_USER_SUSPEND:
1444 as->suspends_read++;
1445 break;
1446
1447 case APM_SYS_STANDBY:
1448 case APM_USER_STANDBY:
1449 as->standbys_read++;
1450 break;
1451 }
1452 buf += sizeof(event);
1453 i -= sizeof(event);
1454 }
1455 if (i < count)
1456 return count - i;
1457 if (signal_pending(current))
1458 return -ERESTARTSYS;
1459 return 0;
1460}
1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait)
1463{
1464 struct apm_user * as;
1465
1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll"))
1468 return 0;
1469 poll_wait(fp, &apm_waitqueue, wait);
1470 if (!queue_empty(as))
1471 return POLLIN | POLLRDNORM;
1472 return 0;
1473}
1474
1475static int do_ioctl(struct inode * inode, struct file *filp,
1476 u_int cmd, u_long arg)
1477{
1478 struct apm_user * as;
1479
1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl"))
1482 return -EIO;
1483 if ((!as->suser) || (!as->writer))
1484 return -EPERM;
1485 switch (cmd) {
1486 case APM_IOC_STANDBY:
1487 if (as->standbys_read > 0) {
1488 as->standbys_read--;
1489 as->standbys_pending--;
1490 standbys_pending--;
1491 } else
1492 queue_event(APM_USER_STANDBY, as);
1493 if (standbys_pending <= 0)
1494 standby();
1495 break;
1496 case APM_IOC_SUSPEND:
1497 if (as->suspends_read > 0) {
1498 as->suspends_read--;
1499 as->suspends_pending--;
1500 suspends_pending--;
1501 } else
1502 queue_event(APM_USER_SUSPEND, as);
1503 if (suspends_pending <= 0) {
1504 return suspend(1);
1505 } else {
1506 as->suspend_wait = 1;
1507 wait_event_interruptible(apm_suspend_waitqueue,
1508 as->suspend_wait == 0);
1509 return as->suspend_result;
1510 }
1511 break;
1512 default:
1513 return -EINVAL;
1514 }
1515 return 0;
1516}
1517
1518static int do_release(struct inode * inode, struct file * filp)
1519{
1520 struct apm_user * as;
1521
1522 as = filp->private_data;
1523 if (check_apm_user(as, "release"))
1524 return 0;
1525 filp->private_data = NULL;
1526 if (as->standbys_pending > 0) {
1527 standbys_pending -= as->standbys_pending;
1528 if (standbys_pending <= 0)
1529 standby();
1530 }
1531 if (as->suspends_pending > 0) {
1532 suspends_pending -= as->suspends_pending;
1533 if (suspends_pending <= 0)
1534 (void) suspend(1);
1535 }
1536 spin_lock(&user_list_lock);
1537 if (user_list == as)
1538 user_list = as->next;
1539 else {
1540 struct apm_user * as1;
1541
1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as);
1544 as1 = as1->next)
1545 ;
1546 if (as1 == NULL)
1547 printk(KERN_ERR "apm: filp not in user list\n");
1548 else
1549 as1->next = as->next;
1550 }
1551 spin_unlock(&user_list_lock);
1552 kfree(as);
1553 return 0;
1554}
1555
1556static int do_open(struct inode * inode, struct file * filp)
1557{
1558 struct apm_user * as;
1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) {
1562 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1563 sizeof(*as));
1564 return -ENOMEM;
1565 }
1566 as->magic = APM_BIOS_MAGIC;
1567 as->event_tail = as->event_head = 0;
1568 as->suspends_pending = as->standbys_pending = 0;
1569 as->suspends_read = as->standbys_read = 0;
1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans
1576 */
1577 as->suser = capable(CAP_SYS_ADMIN);
1578 as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
1579 as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
1580 spin_lock(&user_list_lock);
1581 as->next = user_list;
1582 user_list = as;
1583 spin_unlock(&user_list_lock);
1584 filp->private_data = as;
1585 return 0;
1586}
1587
1588static int proc_apm_show(struct seq_file *m, void *v)
1589{
1590 unsigned short bx;
1591 unsigned short cx;
1592 unsigned short dx;
1593 int error;
1594 unsigned short ac_line_status = 0xff;
1595 unsigned short battery_status = 0xff;
1596 unsigned short battery_flag = 0xff;
1597 int percentage = -1;
1598 int time_units = -1;
1599 char *units = "?";
1600
1601 if ((num_online_cpus() == 1) &&
1602 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1603 ac_line_status = (bx >> 8) & 0xff;
1604 battery_status = bx & 0xff;
1605 if ((cx & 0xff) != 0xff)
1606 percentage = cx & 0xff;
1607
1608 if (apm_info.connection_version > 0x100) {
1609 battery_flag = (cx >> 8) & 0xff;
1610 if (dx != 0xffff) {
1611 units = (dx & 0x8000) ? "min" : "sec";
1612 time_units = dx & 0x7fff;
1613 }
1614 }
1615 }
1616 /* Arguments, with symbols from linux/apm_bios.h. Information is
1617 from the Get Power Status (0x0a) call unless otherwise noted.
1618
1619 0) Linux driver version (this will change if format changes)
1620 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
1621 2) APM flags from APM Installation Check (0x00):
1622 bit 0: APM_16_BIT_SUPPORT
1623 bit 1: APM_32_BIT_SUPPORT
1624 bit 2: APM_IDLE_SLOWS_CLOCK
1625 bit 3: APM_BIOS_DISABLED
1626 bit 4: APM_BIOS_DISENGAGED
1627 3) AC line status
1628 0x00: Off-line
1629 0x01: On-line
1630 0x02: On backup power (BIOS >= 1.1 only)
1631 0xff: Unknown
1632 4) Battery status
1633 0x00: High
1634 0x01: Low
1635 0x02: Critical
1636 0x03: Charging
1637 0x04: Selected battery not present (BIOS >= 1.2 only)
1638 0xff: Unknown
1639 5) Battery flag
1640 bit 0: High
1641 bit 1: Low
1642 bit 2: Critical
1643 bit 3: Charging
1644 bit 7: No system battery
1645 0xff: Unknown
1646 6) Remaining battery life (percentage of charge):
1647 0-100: valid
1648 -1: Unknown
1649 7) Remaining battery life (time units):
1650 Number of remaining minutes or seconds
1651 -1: Unknown
1652 8) min = minutes; sec = seconds */
1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags,
1659 ac_line_status,
1660 battery_status,
1661 battery_flag,
1662 percentage,
1663 time_units,
1664 units);
1665 return 0;
1666}
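
A minimal userspace sketch of consuming the line printed above, assuming only the field order documented in the comment (this reader is illustrative and not part of the driver):

#include <stdio.h>

/* Illustrative /proc/apm reader.  Field order follows the comment in
 * proc_apm_show(): driver version, BIOS version, flags, AC line status,
 * battery status, battery flag, percentage, time units and unit name. */
int main(void)
{
	char drv[16], units[8];
	unsigned int flags, ac, bat_status, bat_flag;
	int bios_major, bios_minor, percentage, time_units;
	FILE *f = fopen("/proc/apm", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%15s %d.%d 0x%x 0x%x 0x%x 0x%x %d%% %d %7s",
		   drv, &bios_major, &bios_minor, &flags, &ac, &bat_status,
		   &bat_flag, &percentage, &time_units, units) == 10)
		printf("battery: %d%%, %d %s remaining\n",
		       percentage, time_units, units);
	fclose(f);
	return 0;
}
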
1667
1668static int proc_apm_open(struct inode *inode, struct file *file)
1669{
1670 return single_open(file, proc_apm_show, NULL);
1671}
1672
1673static const struct file_operations apm_file_ops = {
1674 .owner = THIS_MODULE,
1675 .open = proc_apm_open,
1676 .read = seq_read,
1677 .llseek = seq_lseek,
1678 .release = single_release,
1679};
1680
1681static int apm(void *unused)
1682{
1683 unsigned short bx;
1684 unsigned short cx;
1685 unsigned short dx;
1686 int error;
1687 char * power_stat;
1688 char * bat_stat;
1689
1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT
1692 * This is to avoid random crashes at boot time during initialization
1693 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
1694 * Some bioses don't like being called from CPU != 0.
1695 * Method suggested by Ingo Molnar.
1696 */
1697 set_cpus_allowed(current, cpumask_of_cpu(0));
1698 BUG_ON(smp_processor_id() != 0);
1699#endif
1700
1701 if (apm_info.connection_version == 0) {
1702 apm_info.connection_version = apm_info.bios.version;
1703 if (apm_info.connection_version > 0x100) {
1704 /*
1705 * We only support BIOSes up to version 1.2
1706 */
1707 if (apm_info.connection_version > 0x0102)
1708 apm_info.connection_version = 0x0102;
1709 error = apm_driver_version(&apm_info.connection_version);
1710 if (error != APM_SUCCESS) {
1711 apm_error("driver version", error);
1712 /* Fall back to an APM 1.0 connection. */
1713 apm_info.connection_version = 0x100;
1714 }
1715 }
1716 }
1717
1718 if (debug)
1719 printk(KERN_INFO "apm: Connection version %d.%d\n",
1720 (apm_info.connection_version >> 8) & 0xff,
1721 apm_info.connection_version & 0xff);
1722
1723#ifdef CONFIG_APM_DO_ENABLE
1724 if (apm_info.bios.flags & APM_BIOS_DISABLED) {
1725 /*
1726 * This call causes my NEC UltraLite Versa 33/C to hang if it
1727 * is booted with PM disabled but not in the docking station.
1728 * Unfortunate ...
1729 */
1730 error = apm_enable_power_management(1);
1731 if (error) {
1732 apm_error("enable power management", error);
1733 return -1;
1734 }
1735 }
1736#endif
1737
1738 if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
1739 && (apm_info.connection_version > 0x0100)) {
1740 error = apm_engage_power_management(APM_DEVICE_ALL, 1);
1741 if (error) {
1742 apm_error("engage power management", error);
1743 return -1;
1744 }
1745 }
1746
1747 if (debug && (num_online_cpus() == 1 || smp )) {
1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n");
1751 else {
1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break;
1754 case 1: power_stat = "on line"; break;
1755 case 2: power_stat = "on backup power"; break;
1756 default: power_stat = "unknown"; break;
1757 }
1758 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break;
1760 case 1: bat_stat = "low"; break;
1761 case 2: bat_stat = "critical"; break;
1762 case 3: bat_stat = "charging"; break;
1763 default: bat_stat = "unknown"; break;
1764 }
1765 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ",
1767 power_stat, bat_stat);
1768 if ((cx & 0xff) == 0xff)
1769 printk("unknown\n");
1770 else
1771 printk("%d%%\n", cx & 0xff);
1772 if (apm_info.connection_version > 0x100) {
1773 printk(KERN_INFO
1774 "apm: battery flag 0x%02x, battery life ",
1775 (cx >> 8) & 0xff);
1776 if (dx == 0xffff)
1777 printk("unknown\n");
1778 else
1779 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ?
1781 "minutes" : "seconds");
1782 }
1783 }
1784 }
1785
1786 /* Install our power off handler.. */
1787 if (power_off)
1788 pm_power_off = apm_power_off;
1789
1790 if (num_online_cpus() == 1 || smp) {
1791#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1792 console_blank_hook = apm_console_blank;
1793#endif
1794 apm_mainloop();
1795#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
1796 console_blank_hook = NULL;
1797#endif
1798 }
1799
1800 return 0;
1801}
1802
1803#ifndef MODULE
1804static int __init apm_setup(char *str)
1805{
1806 int invert;
1807
1808 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0)
1810 apm_disabled = 1;
1811 if (strncmp(str, "on", 2) == 0)
1812 apm_disabled = 0;
1813 if ((strncmp(str, "bounce-interval=", 16) == 0) ||
1814 (strncmp(str, "bounce_interval=", 16) == 0))
1815 bounce_interval = simple_strtol(str + 16, NULL, 0);
1816 if ((strncmp(str, "idle-threshold=", 15) == 0) ||
1817 (strncmp(str, "idle_threshold=", 15) == 0))
1818 idle_threshold = simple_strtol(str + 15, NULL, 0);
1819 if ((strncmp(str, "idle-period=", 12) == 0) ||
1820 (strncmp(str, "idle_period=", 12) == 0))
1821 idle_period = simple_strtol(str + 12, NULL, 0);
1822 invert = (strncmp(str, "no-", 3) == 0) ||
1823 (strncmp(str, "no_", 3) == 0);
1824 if (invert)
1825 str += 3;
1826 if (strncmp(str, "debug", 5) == 0)
1827 debug = !invert;
1828 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0)
1832 {
1833 smp = !invert;
1834 idle_threshold = 100;
1835 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert;
1842 if ((strncmp(str, "realmode-power-off", 18) == 0) ||
1843 (strncmp(str, "realmode_power_off", 18) == 0))
1844 apm_info.realmode_power_off = !invert;
1845 str = strchr(str, ',');
1846 if (str != NULL)
1847 str += strspn(str, ", \t");
1848 }
1849 return 1;
1850}
1851
1852__setup("apm=", apm_setup);
1853#endif
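/*
 * A sketch of how the options above combine on the kernel command line
 * (hypothetical values; parsed by apm_setup() above):
 *
 *     apm=on,debug,power-off,idle-threshold=90,idle-period=600
 *
 * Options are comma separated, and a "no-" or "no_" prefix inverts the
 * boolean options (e.g. "no-debug").
 */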
1854
1855static const struct file_operations apm_bios_fops = {
1856 .owner = THIS_MODULE,
1857 .read = do_read,
1858 .poll = do_poll,
1859 .ioctl = do_ioctl,
1860 .open = do_open,
1861 .release = do_release,
1862};
1863
1864static struct miscdevice apm_device = {
1865 APM_MINOR_DEV,
1866 "apm_bios",
1867 &apm_bios_fops
1868};
1869
1870
1871/* Simple "print if true" callback */
1872static int __init print_if_true(const struct dmi_system_id *d)
1873{
1874 printk("%s\n", d->ident);
1875 return 0;
1876}
1877
1878/*
1879 * Some BIOSes enable the PS/2 mouse (touchpad) at resume, even if it was
1880 * disabled before the suspend. Linux used to get terribly confused by that.
1881 */
1882static int __init broken_ps2_resume(const struct dmi_system_id *d)
1883{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
1885 return 0;
1886}
1887
1888/* Some BIOSes have a broken protected mode poweroff and need to use real mode */
1889static int __init set_realmode_power_off(const struct dmi_system_id *d)
1890{
1891 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
1894 }
1895 return 0;
1896}
1897
1898/* Some laptops require interrupts to be enabled during APM calls */
1899static int __init set_apm_ints(const struct dmi_system_id *d)
1900{
1901 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
1904 }
1905 return 0;
1906}
1907
1908/* Some APM BIOSes corrupt memory or just plain do not work */
1909static int __init apm_is_horked(const struct dmi_system_id *d)
1910{
1911 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1914 }
1915 return 0;
1916}
1917
1918static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1919{
1920 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
1923 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
1924 printk(KERN_INFO "download from support.intel.com \n");
1925 }
1926 return 0;
1927}
1928
1929/* Some APM BIOSes hang on APM idle calls */
1930static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1931{
1932 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
1935 }
1936 return 0;
1937}
1938
1939/*
1940 * Check for clue-free BIOS implementations that use
1941 * the following QA technique:
1942 *
1943 * [ Write BIOS Code ]<------
1944 * | ^
1945 * < Does it Compile >----N--
1946 * |Y ^
1947 * < Does it Boot Win98 >-N--
1948 * |Y
1949 * [Ship It]
1950 *
1951 * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
1952 * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
1953 */
1954static int __init broken_apm_power(const struct dmi_system_id *d)
1955{
1956 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
1958 return 0;
1959}
1960
1961/*
1962 * This BIOS swaps the APM minute reporting bytes over (many Sony laptops
1963 * have this problem).
1964 */
1965static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
1966{
1967 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
1969 return 0;
1970}
1971
1972static struct dmi_system_id __initdata apm_dmi_table[] = {
1973 {
1974 print_if_true,
1975 KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
1976 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
1977 DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
1978 },
1979 { /* Handle problems with APM on the C600 */
1980 broken_ps2_resume, "Dell Latitude C600",
1981 { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
1982 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
1983 },
1984 { /* Allow interrupts during suspend on Dell Latitude laptops*/
1985 set_apm_ints, "Dell Latitude",
1986 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1987 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
1988 },
1989 { /* APM crashes */
1990 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
1995 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", {
1998 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1999 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
2000 },
2001 { /* Handle problems with APM on Inspiron 5000e */
2002 broken_apm_power, "Dell Inspiron 5000e",
2003 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2004 DMI_MATCH(DMI_BIOS_VERSION, "A04"),
2005 DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
2006 },
2007 { /* Handle problems with APM on Inspiron 2500 */
2008 broken_apm_power, "Dell Inspiron 2500",
2009 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2010 DMI_MATCH(DMI_BIOS_VERSION, "A12"),
2011 DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
2012 },
2013 { /* APM crashes */
2014 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2019 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
2026 },
2027 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK",
2029 { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
2030 DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
2031 },
2032 { /* APM blows on shutdown */
2033 apm_is_horked, "ABIT KX7-333[R]",
2034 { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
2035 DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
2036 },
2037 { /* APM crashes */
2038 apm_is_horked, "Trigem Delhi3",
2039 { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
2040 DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
2041 },
2042 { /* APM crashes */
2043 apm_is_horked, "Fujitsu-Siemens",
2044 { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
2045 DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
2046 },
2047 { /* APM crashes */
2048 apm_is_horked_d850md, "Intel D850MD",
2049 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2050 DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
2051 },
2052 { /* APM crashes */
2053 apm_is_horked, "Intel D810EMO",
2054 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2055 DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
2056 },
2057 { /* APM crashes */
2058 apm_is_horked, "Dell XPS-Z",
2059 { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2060 DMI_MATCH(DMI_BIOS_VERSION, "A11"),
2061 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
2062 },
2063 { /* APM crashes */
2064 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
2069 },
2070 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
2076 },
2077 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD",
2079 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2080 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
2081 },
2082 { /* APM idle hangs */
2083 apm_likes_to_melt, "AMI Bios",
2084 { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
2085 DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
2086 },
2087 { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
2088 swab_apm_power_in_minutes, "Sony VAIO",
2089 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2090 DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
2091 DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
2092 },
2093 { /* Handle problems with APM on Sony Vaio PCG-N505VX */
2094 swab_apm_power_in_minutes, "Sony VAIO",
2095 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2096 DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
2097 DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
2098 },
2099 { /* Handle problems with APM on Sony Vaio PCG-XG29 */
2100 swab_apm_power_in_minutes, "Sony VAIO",
2101 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2102 DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
2103 DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
2104 },
2105 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2106 swab_apm_power_in_minutes, "Sony VAIO",
2107 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2108 DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
2109 DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
2110 },
2111 { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
2112 swab_apm_power_in_minutes, "Sony VAIO",
2113 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2114 DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
2115 DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
2116 },
2117 { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
2118 swab_apm_power_in_minutes, "Sony VAIO",
2119 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2120 DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
2121 DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
2122 },
2123 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2124 swab_apm_power_in_minutes, "Sony VAIO",
2125 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2126 DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
2127 DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
2128 },
2129 { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
2130 swab_apm_power_in_minutes, "Sony VAIO",
2131 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2132 DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
2133 DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
2134 },
2135 { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
2136 swab_apm_power_in_minutes, "Sony VAIO",
2137 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2138 DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
2139 DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
2140 },
2141 { /* Handle problems with APM on Sony Vaio PCG-F104K */
2142 swab_apm_power_in_minutes, "Sony VAIO",
2143 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2144 DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
2145 DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
2146 },
2147
2148 { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
2149 swab_apm_power_in_minutes, "Sony VAIO",
2150 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2151 DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
2152 DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
2153 },
2154 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2155 swab_apm_power_in_minutes, "Sony VAIO",
2156 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2157 DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
2158 DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
2159 },
2160 { /* Handle problems with APM on Sony Vaio PCG-C1VE */
2161 swab_apm_power_in_minutes, "Sony VAIO",
2162 { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2163 DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
2164 DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
2165 },
2166 { /* broken PM poweroff bios */
2167 set_realmode_power_off, "Award Software v4.60 PGMA",
2168 { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
2169 DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
2170 DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
2171 },
2172
2173 /* Generic per vendor APM settings */
2174
2175 { /* Allow interrupts during suspend on IBM laptops */
2176 set_apm_ints, "IBM",
2177 { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
2178 },
2179
2180 { }
2181};
2182
2183/*
2184 * Just start the APM thread. We do NOT want to do APM BIOS
2185 * calls from anything but the APM thread, if for no other reason
2186 * than the fact that we don't trust the APM BIOS. This way,
2187 * most common APM BIOS problems that lead to protection errors,
2188 * etc. will at least be somewhat contained...
2189 *
2190 * In short, if something bad happens, at least we have a choice
2191 * of just killing the apm thread..
2192 */
2193static int __init apm_init(void)
2194{
2195 struct proc_dir_entry *apm_proc;
2196 struct desc_struct *gdt;
2197 int err;
2198
2199 dmi_check_system(apm_dmi_table);
2200
2201 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2202 printk(KERN_INFO "apm: BIOS not found.\n");
2203 return -ENODEV;
2204 }
2205 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags,
2210 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV;
2214 }
2215
2216 if (allow_ints)
2217 apm_info.allow_ints = 1;
2218 if (broken_psr)
2219 apm_info.get_power_status_broken = 1;
2220 if (realmode_power_off)
2221 apm_info.realmode_power_off = 1;
2222 /* User can override, but default is to trust DMI */
2223 if (apm_disabled != -1)
2224 apm_info.disabled = apm_disabled;
2225
2226 /*
2227 * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
2228 * but is reportedly a 1.0 BIOS.
2229 */
2230 if (apm_info.bios.version == 0x001)
2231 apm_info.bios.version = 0x100;
2232
2233 /* BIOS < 1.2 doesn't set cseg_16_len */
2234 if (apm_info.bios.version < 0x102)
2235 apm_info.bios.cseg_16_len = 0; /* 64k */
2236
2237 if (debug) {
2238 printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
2239 apm_info.bios.cseg, apm_info.bios.offset,
2240 apm_info.bios.cseg_16, apm_info.bios.dseg);
2241 if (apm_info.bios.version > 0x100)
2242 printk(" cseg len %x, dseg len %x",
2243 apm_info.bios.cseg_len,
2244 apm_info.bios.dseg_len);
2245 if (apm_info.bios.version > 0x101)
2246 printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
2247 printk("\n");
2248 }
2249
2250 if (apm_info.disabled) {
2251 printk(KERN_NOTICE "apm: disabled on user request.\n");
2252 return -ENODEV;
2253 }
2254 if ((num_online_cpus() > 1) && !power_off && !smp) {
2255 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
2256 apm_info.disabled = 1;
2257 return -ENODEV;
2258 }
2259 if (PM_IS_ACTIVE()) {
2260 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2261 apm_info.disabled = 1;
2262 return -ENODEV;
2263 }
2264#ifdef CONFIG_PM_LEGACY
2265 pm_active = 1;
2266#endif
2267
2268 /*
2269 * Set up a segment that references the real mode segment 0x40
2270 * that extends up to the end of page zero (that we have reserved).
2271 * This is for buggy BIOS's that refer to (real mode) segment 0x40
2272 * even though they are called in protected mode.
2273 */
2274 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
2275 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
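/*
 * Arithmetic sketch (derived from the two calls above): 0x40 << 4 is
 * physical address 0x400, so the descriptor base becomes __va(0x400) and
 * the limit of 4095 - 0x400 = 0xbff keeps accesses within the reserved
 * page zero.
 */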
2276
2277 /*
2278 * Set up the long jump entry point to the APM BIOS, which is called
2279 * from inline assembly.
2280 */
2281 apm_bios_entry.offset = apm_info.bios.offset;
2282 apm_bios_entry.segment = APM_CS;
2283
2284 /*
2285 * The APM 1.1 BIOS is supposed to provide limit information that it
2286 * recognizes. Many machines do this correctly, but many others do
2287 * not restrict themselves to their claimed limit. When this happens,
2288 * they will cause a segmentation violation in the kernel at boot time.
2289 * Most BIOS's, however, will respect a 64k limit, so we use that.
2290 *
2291 * Note we only set APM segments on CPU zero, since we pin the APM
2292 * code to that CPU.
2293 */
2294 gdt = get_cpu_gdt_table(0);
2295 set_base(gdt[APM_CS >> 3],
2296 __va((unsigned long)apm_info.bios.cseg << 4));
2297 set_base(gdt[APM_CS_16 >> 3],
2298 __va((unsigned long)apm_info.bios.cseg_16 << 4));
2299 set_base(gdt[APM_DS >> 3],
2300 __va((unsigned long)apm_info.bios.dseg << 4));
2301
2302 apm_proc = create_proc_entry("apm", 0, NULL);
2303 if (apm_proc)
2304 apm_proc->proc_fops = &apm_file_ops;
2305
2306 kapmd_task = kthread_create(apm, NULL, "kapmd");
2307 if (IS_ERR(kapmd_task)) {
2308 printk(KERN_ERR "apm: disabled - Unable to start kernel "
2309 "thread.\n");
2310 err = PTR_ERR(kapmd_task);
2311 kapmd_task = NULL;
2312 remove_proc_entry("apm", NULL);
2313 return err;
2314 }
2315 wake_up_process(kapmd_task);
2316
2317 if (num_online_cpus() > 1 && !smp ) {
2318 printk(KERN_NOTICE
2319 "apm: disabled - APM is not SMP safe (power off active).\n");
2320 return 0;
2321 }
2322
2323 /*
2324 * Note that we don't actually care if the misc device cannot be
2325 * registered: this driver can do its job without it, even if userspace
2326 * can't control it. Just log the error.
2327 */
2328 if (misc_register(&apm_device))
2329 printk(KERN_WARNING "apm: Could not register misc device.\n");
2330
2331 if (HZ != 100)
2332 idle_period = (idle_period * HZ) / 100;
2333 if (idle_threshold < 100) {
2334 original_pm_idle = pm_idle;
2335 pm_idle = apm_cpu_idle;
2336 set_pm_idle = 1;
2337 }
2338
2339 return 0;
2340}
2341
2342static void __exit apm_exit(void)
2343{
2344 int error;
2345
2346 if (set_pm_idle) {
2347 pm_idle = original_pm_idle;
2348 /*
2349 * We are about to unload the current idle thread pm callback
2350 * (pm_idle), Wait for all processors to update cached/local
2351 * copies of pm_idle before proceeding.
2352 */
2353 cpu_idle_wait();
2354 }
2355 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2356 && (apm_info.connection_version > 0x0100)) {
2357 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
2358 if (error)
2359 apm_error("disengage power management", error);
2360 }
2361 misc_deregister(&apm_device);
2362 remove_proc_entry("apm", NULL);
2363 if (power_off)
2364 pm_power_off = NULL;
2365 if (kapmd_task) {
2366 kthread_stop(kapmd_task);
2367 kapmd_task = NULL;
2368 }
2369#ifdef CONFIG_PM_LEGACY
2370 pm_active = 0;
2371#endif
2372}
2373
2374module_init(apm_init);
2375module_exit(apm_exit);
2376
2377MODULE_AUTHOR("Stephen Rothwell");
2378MODULE_DESCRIPTION("Advanced Power Management");
2379MODULE_LICENSE("GPL");
2380module_param(debug, bool, 0644);
2381MODULE_PARM_DESC(debug, "Enable debug mode");
2382module_param(power_off, bool, 0444);
2383MODULE_PARM_DESC(power_off, "Enable power off");
2384module_param(bounce_interval, int, 0444);
2385MODULE_PARM_DESC(bounce_interval,
2386 "Set the number of ticks to ignore suspend bounces");
2387module_param(allow_ints, bool, 0444);
2388MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
2389module_param(broken_psr, bool, 0444);
2390MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
2391module_param(realmode_power_off, bool, 0444);
2392MODULE_PARM_DESC(realmode_power_off,
2393 "Switch to real mode before powering off");
2394module_param(idle_threshold, int, 0444);
2395MODULE_PARM_DESC(idle_threshold,
2396 "System idle percentage above which to make APM BIOS idle calls");
2397module_param(idle_period, int, 0444);
2398MODULE_PARM_DESC(idle_period,
2399 "Period (in sec/100) over which to caculate the idle percentage");
2400module_param(smp, bool, 0444);
2401MODULE_PARM_DESC(smp,
2402 "Set this to enable APM use on an SMP platform. Use with caution on older systems");
2403MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 000000000000..cfa82c899f47
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c"
3#else
4# include "asm-offsets_64.c"
5#endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 000000000000..8029742c0fc1
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,147 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <asm/ucontext.h>
13#include "sigframe_32.h"
14#include <asm/pgtable.h>
15#include <asm/fixmap.h>
16#include <asm/processor.h>
17#include <asm/thread_info.h>
18#include <asm/elf.h>
19
20#include <xen/interface/xen.h>
21
22#ifdef CONFIG_LGUEST_GUEST
23#include <linux/lguest.h>
24#include "../../../drivers/lguest/lg.h"
25#endif
26
27#define DEFINE(sym, val) \
28 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
29
30#define BLANK() asm volatile("\n->" : : )
31
32#define OFFSET(sym, str, mem) \
33 DEFINE(sym, offsetof(struct str, mem));
34
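/* Illustration of the mechanism (hypothetical offset value): DEFINE()
 * plants marker lines such as
 *
 *     ->PT_EAX $24 offsetof(struct pt_regs, eax)
 *
 * in the compiler's assembly output; the build then post-processes those
 * "->" markers into "#define PT_EAX 24" style constants in a generated
 * asm-offsets header that assembly code can include.
 */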
35/* workaround for a warning with -Wmissing-prototypes */
36void foo(void);
37
38void foo(void)
39{
40 OFFSET(SIGCONTEXT_eax, sigcontext, eax);
41 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
42 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
43 OFFSET(SIGCONTEXT_edx, sigcontext, edx);
44 OFFSET(SIGCONTEXT_esi, sigcontext, esi);
45 OFFSET(SIGCONTEXT_edi, sigcontext, edi);
46 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
47 OFFSET(SIGCONTEXT_esp, sigcontext, esp);
48 OFFSET(SIGCONTEXT_eip, sigcontext, eip);
49 BLANK();
50
51 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
52 OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
53 OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
54 OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
55 OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
56 OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
57 OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
58 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
59 BLANK();
60
61 OFFSET(TI_task, thread_info, task);
62 OFFSET(TI_exec_domain, thread_info, exec_domain);
63 OFFSET(TI_flags, thread_info, flags);
64 OFFSET(TI_status, thread_info, status);
65 OFFSET(TI_preempt_count, thread_info, preempt_count);
66 OFFSET(TI_addr_limit, thread_info, addr_limit);
67 OFFSET(TI_restart_block, thread_info, restart_block);
68 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
69 OFFSET(TI_cpu, thread_info, cpu);
70 BLANK();
71
72 OFFSET(GDS_size, Xgt_desc_struct, size);
73 OFFSET(GDS_address, Xgt_desc_struct, address);
74 OFFSET(GDS_pad, Xgt_desc_struct, pad);
75 BLANK();
76
77 OFFSET(PT_EBX, pt_regs, ebx);
78 OFFSET(PT_ECX, pt_regs, ecx);
79 OFFSET(PT_EDX, pt_regs, edx);
80 OFFSET(PT_ESI, pt_regs, esi);
81 OFFSET(PT_EDI, pt_regs, edi);
82 OFFSET(PT_EBP, pt_regs, ebp);
83 OFFSET(PT_EAX, pt_regs, eax);
84 OFFSET(PT_DS, pt_regs, xds);
85 OFFSET(PT_ES, pt_regs, xes);
86 OFFSET(PT_FS, pt_regs, xfs);
87 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
88 OFFSET(PT_EIP, pt_regs, eip);
89 OFFSET(PT_CS, pt_regs, xcs);
90 OFFSET(PT_EFLAGS, pt_regs, eflags);
91 OFFSET(PT_OLDESP, pt_regs, esp);
92 OFFSET(PT_OLDSS, pt_regs, xss);
93 BLANK();
94
95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
96 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
97 BLANK();
98
99 OFFSET(pbe_address, pbe, address);
100 OFFSET(pbe_orig_address, pbe, orig_address);
101 OFFSET(pbe_next, pbe, next);
102
103 /* Offset from the sysenter stack to tss.esp0 */
104 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
105 sizeof(struct tss_struct));
106
107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
108 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
109 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
112
113 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
114
115 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
116
117#ifdef CONFIG_PARAVIRT
118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
125#endif
126
127#ifdef CONFIG_XEN
128 BLANK();
129 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
130 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
131#endif
132
133#ifdef CONFIG_LGUEST_GUEST
134 BLANK();
135 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
136 OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
137 OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
138 OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
139 OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
140 OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
141 OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
142 OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
143 OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
144 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
145 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
146#endif
147}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 000000000000..778953bc636c
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,85 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/stddef.h>
10#include <linux/errno.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <asm/pda.h>
14#include <asm/processor.h>
15#include <asm/segment.h>
16#include <asm/thread_info.h>
17#include <asm/ia32.h>
18
19#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
21
22#define BLANK() asm volatile("\n->" : : )
23
24#define __NO_STUBS 1
25#undef __SYSCALL
26#undef _ASM_X86_64_UNISTD_H_
27#define __SYSCALL(nr, sym) [nr] = 1,
28static char syscalls[] = {
29#include <asm/unistd.h>
30};
31
32int main(void)
33{
34#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
35 ENTRY(state);
36 ENTRY(flags);
37 ENTRY(thread);
38 ENTRY(pid);
39 BLANK();
40#undef ENTRY
41#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
42 ENTRY(flags);
43 ENTRY(addr_limit);
44 ENTRY(preempt_count);
45 ENTRY(status);
46 BLANK();
47#undef ENTRY
48#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
49 ENTRY(kernelstack);
50 ENTRY(oldrsp);
51 ENTRY(pcurrent);
52 ENTRY(irqcount);
53 ENTRY(cpunumber);
54 ENTRY(irqstackptr);
55 ENTRY(data_offset);
56 BLANK();
57#undef ENTRY
58#ifdef CONFIG_IA32_EMULATION
59#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
60 ENTRY(eax);
61 ENTRY(ebx);
62 ENTRY(ecx);
63 ENTRY(edx);
64 ENTRY(esi);
65 ENTRY(edi);
66 ENTRY(ebp);
67 ENTRY(esp);
68 ENTRY(eip);
69 BLANK();
70#undef ENTRY
71 DEFINE(IA32_RT_SIGFRAME_sigcontext,
72 offsetof (struct rt_sigframe32, uc.uc_mcontext));
73 BLANK();
74#endif
75 DEFINE(pbe_address, offsetof(struct pbe, address));
76 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
77 DEFINE(pbe_next, offsetof(struct pbe, next));
78 BLANK();
79 DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
80 BLANK();
81 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
82 BLANK();
83 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
84 return 0;
85}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644
index 000000000000..06d3e5a14d9d
--- /dev/null
+++ b/arch/x86/kernel/audit_64.c
@@ -0,0 +1,81 @@
1#include <linux/init.h>
2#include <linux/types.h>
3#include <linux/audit.h>
4#include <asm/unistd.h>
5
6static unsigned dir_class[] = {
7#include <asm-generic/audit_dir_write.h>
8~0U
9};
10
11static unsigned read_class[] = {
12#include <asm-generic/audit_read.h>
13~0U
14};
15
16static unsigned write_class[] = {
17#include <asm-generic/audit_write.h>
18~0U
19};
20
21static unsigned chattr_class[] = {
22#include <asm-generic/audit_change_attr.h>
23~0U
24};
25
26static unsigned signal_class[] = {
27#include <asm-generic/audit_signal.h>
28~0U
29};
30
31int audit_classify_arch(int arch)
32{
33#ifdef CONFIG_IA32_EMULATION
34 if (arch == AUDIT_ARCH_I386)
35 return 1;
36#endif
37 return 0;
38}
39
40int audit_classify_syscall(int abi, unsigned syscall)
41{
42#ifdef CONFIG_IA32_EMULATION
43 extern int ia32_classify_syscall(unsigned);
44 if (abi == AUDIT_ARCH_I386)
45 return ia32_classify_syscall(syscall);
46#endif
47 switch(syscall) {
48 case __NR_open:
49 return 2;
50 case __NR_openat:
51 return 3;
52 case __NR_execve:
53 return 5;
54 default:
55 return 0;
56 }
57}
58
59static int __init audit_classes_init(void)
60{
61#ifdef CONFIG_IA32_EMULATION
62 extern __u32 ia32_dir_class[];
63 extern __u32 ia32_write_class[];
64 extern __u32 ia32_read_class[];
65 extern __u32 ia32_chattr_class[];
66 extern __u32 ia32_signal_class[];
67 audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
68 audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
69 audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
70 audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
71 audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
72#endif
73 audit_register_class(AUDIT_CLASS_WRITE, write_class);
74 audit_register_class(AUDIT_CLASS_READ, read_class);
75 audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
76 audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
77 audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
78 return 0;
79}
80
81__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 000000000000..0b9860530a6b
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,98 @@
1/*
2 * Implement 'Simple Boot Flag Specification 2.0'
3 */
4
5
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/init.h>
9#include <linux/string.h>
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/acpi.h>
13#include <asm/io.h>
14
15#include <linux/mc146818rtc.h>
16
17
18#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7)
23
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26
27
28static int __init parity(u8 v)
29{
30 int x = 0;
31 int i;
32
33 for(i=0;i<8;i++)
34 {
35 x^=(v&1);
36 v>>=1;
37 }
38 return x;
39}
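/*
 * For illustration: parity() returns the XOR of all eight bits, so
 * parity(0x03) == 0 (two bits set) and parity(0x01) == 1. sbf_write()
 * below uses it to force odd parity over the whole byte.
 */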
40
41static void __init sbf_write(u8 v)
42{
43 unsigned long flags;
44 if(sbf_port != -1)
45 {
46 v &= ~SBF_PARITY;
47 if(!parity(v))
48 v|=SBF_PARITY;
49
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
51
52 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port);
54 spin_unlock_irqrestore(&rtc_lock, flags);
55 }
56}
57
58static u8 __init sbf_read(void)
59{
60 u8 v;
61 unsigned long flags;
62 if(sbf_port == -1)
63 return 0;
64 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags);
67 return v;
68}
69
70static int __init sbf_value_valid(u8 v)
71{
72 if(v&SBF_RESERVED) /* Reserved bits */
73 return 0;
74 if(!parity(v))
75 return 0;
76 return 1;
77}
78
79static int __init sbf_init(void)
80{
81 u8 v;
82 if(sbf_port == -1)
83 return 0;
84 v = sbf_read();
85 if(!sbf_value_valid(v))
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
87
88 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING;
90 v &= ~SBF_DIAG;
91#if defined(CONFIG_ISAPNP)
92 v |= SBF_PNPOS;
93#endif
94 sbf_write(v);
95 return 0;
96}
97
98module_init(sbf_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
new file mode 100644
index 000000000000..4e5e9d364d63
--- /dev/null
+++ b/arch/x86/kernel/bugs_64.c
@@ -0,0 +1,24 @@
1/*
2 * arch/x86_64/kernel/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2000 SuSE
6 */
7
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <asm/alternative.h>
11#include <asm/bugs.h>
12#include <asm/processor.h>
13#include <asm/mtrr.h>
14
15void __init check_bugs(void)
16{
17 identify_cpu(&boot_cpu_data);
18 mtrr_bp_init();
19#if !defined(CONFIG_SMP)
20 printk("CPU: ");
21 print_cpu_info(&boot_cpu_data);
22#endif
23 alternative_instructions();
24}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
new file mode 100644
index 000000000000..778396c78d65
--- /dev/null
+++ b/arch/x86/kernel/cpu/Makefile
@@ -0,0 +1,20 @@
1#
2# Makefile for x86-compatible CPU details and quirks
3#
4
5obj-y := common.o proc.o bugs.o
6
7obj-y += amd.o
8obj-y += cyrix.o
9obj-y += centaur.o
10obj-y += transmeta.o
11obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o
12obj-y += nexgen.o
13obj-y += umc.o
14
15obj-$(CONFIG_X86_MCE) += mcheck/
16
17obj-$(CONFIG_MTRR) += mtrr/
18obj-$(CONFIG_CPU_FREQ) += cpufreq/
19
20obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
new file mode 100644
index 000000000000..3e91d3ee26ec
--- /dev/null
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -0,0 +1,50 @@
1
2/*
3 * Routines to identify additional CPU features that are scattered in
4 * CPUID space.
5 */
6
7#include <linux/cpu.h>
8
9#include <asm/processor.h>
10
11struct cpuid_bit {
12 u16 feature;
13 u8 reg;
14 u8 bit;
15 u32 level;
16};
17
18enum cpuid_regs {
19 CR_EAX = 0,
20 CR_ECX,
21 CR_EDX,
22 CR_EBX
23};
24
25void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
26{
27 u32 max_level;
28 u32 regs[4];
29 const struct cpuid_bit *cb;
30
31 static const struct cpuid_bit cpuid_bits[] = {
32 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
33 { 0, 0, 0, 0 }
34 };
35
36 for (cb = cpuid_bits; cb->feature; cb++) {
37
38 /* Verify that the level is valid */
39 max_level = cpuid_eax(cb->level & 0xffff0000);
40 if (max_level < cb->level ||
41 max_level > (cb->level | 0xffff))
42 continue;
43
44 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
45 &regs[CR_ECX], &regs[CR_EDX]);
46
47 if (regs[cb->reg] & (1 << cb->bit))
48 set_bit(cb->feature, c->x86_capability);
49 }
50}
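/*
 * Illustration (not part of the original file): for the X86_FEATURE_IDA
 * entry above, cb->level is 0x00000006, so the loop first reads CPUID
 * leaf 0 to confirm the maximum standard leaf is at least 6, then runs
 * CPUID(6) and tests bit 1 of EAX before setting the capability bit.
 */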
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
new file mode 100644
index 000000000000..dcf6bbb1c7c0
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd.c
@@ -0,0 +1,337 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/mm.h>
4#include <asm/io.h>
5#include <asm/processor.h>
6#include <asm/apic.h>
7
8#include "cpu.h"
9
10/*
11 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
12 * misexecution of code under Linux. Owners of such processors should
13 * contact AMD for precise details and a CPU swap.
14 *
15 * See http://www.multimania.com/poulot/k6bug.html
16 * http://www.amd.com/K6/k6docs/revgd.html
17 *
18 * The following test is erm.. interesting. AMD neglected to up
19 * the chip setting when fixing the bug but they also tweaked some
20 * performance at the same time..
21 */
22
23extern void vide(void);
24__asm__(".align 4\nvide: ret");
25
26#ifdef CONFIG_X86_LOCAL_APIC
27#define ENABLE_C1E_MASK 0x18000000
28#define CPUID_PROCESSOR_SIGNATURE 1
29#define CPUID_XFAM 0x0ff00000
30#define CPUID_XFAM_K8 0x00000000
31#define CPUID_XFAM_10H 0x00100000
32#define CPUID_XFAM_11H 0x00200000
33#define CPUID_XMOD 0x000f0000
34#define CPUID_XMOD_REV_F 0x00040000
35
36/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
37static __cpuinit int amd_apic_timer_broken(void)
38{
39 u32 lo, hi;
40 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
41 switch (eax & CPUID_XFAM) {
42 case CPUID_XFAM_K8:
43 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
44 break;
45 case CPUID_XFAM_10H:
46 case CPUID_XFAM_11H:
47 rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
48 if (lo & ENABLE_C1E_MASK)
49 return 1;
50 break;
51 default:
52 /* err on the side of caution */
53 return 1;
54 }
55 return 0;
56}
57#endif
58
59int force_mwait __cpuinitdata;
60
61static void __cpuinit init_amd(struct cpuinfo_x86 *c)
62{
63 u32 l, h;
64 int mbytes = num_physpages >> (20-PAGE_SHIFT);
65 int r;
66
67#ifdef CONFIG_SMP
68 unsigned long long value;
69
70 /* Disable TLB flush filter by setting HWCR.FFDIS on K8
71 * bit 6 of msr C001_0015
72 *
73 * Errata 63 for SH-B3 steppings
74 * Errata 122 for all steppings (F+ have it disabled by default)
75 */
76 if (c->x86 == 15) {
77 rdmsrl(MSR_K7_HWCR, value);
78 value |= 1 << 6;
79 wrmsrl(MSR_K7_HWCR, value);
80 }
81#endif
82
83 /*
84 * FIXME: We should handle the K5 here. Set up the write
85 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
86 * no bus pipeline)
87 */
88
89 /* Bit 31 in normal CPUID is used for the nonstandard 3DNow! ID;
90 3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
91 clear_bit(0*32+31, c->x86_capability);
92
93 r = get_model_name(c);
94
95 switch(c->x86)
96 {
97 case 4:
98 /*
99 * General Systems BIOSen alias the cpu frequency registers
100 * of the Elan at 0x000df000. Unfortunately, one of the Linux
101 * drivers subsequently pokes it, and changes the CPU speed.
102 * Workaround: remove the unneeded alias.
103 */
104#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
105#define CBAR_ENB (0x80000000)
106#define CBAR_KEY (0X000000CB)
107 if (c->x86_model==9 || c->x86_model == 10) {
108 if (inl (CBAR) & CBAR_ENB)
109 outl (0 | CBAR_KEY, CBAR);
110 }
111 break;
112 case 5:
113 if( c->x86_model < 6 )
114 {
115 /* Based on AMD doc 20734R - June 2000 */
116 if ( c->x86_model == 0 ) {
117 clear_bit(X86_FEATURE_APIC, c->x86_capability);
118 set_bit(X86_FEATURE_PGE, c->x86_capability);
119 }
120 break;
121 }
122
123 if ( c->x86_model == 6 && c->x86_mask == 1 ) {
124 const int K6_BUG_LOOP = 1000000;
125 int n;
126 void (*f_vide)(void);
127 unsigned long d, d2;
128
129 printk(KERN_INFO "AMD K6 stepping B detected - ");
130
131 /*
132 * It looks like AMD fixed the 2.6.2 bug and improved indirect
133 * calls at the same time.
134 */
135
136 n = K6_BUG_LOOP;
137 f_vide = vide;
138 rdtscl(d);
139 while (n--)
140 f_vide();
141 rdtscl(d2);
142 d = d2-d;
143
144 if (d > 20*K6_BUG_LOOP)
145 printk("system stability may be impaired when more than 32 MB are used.\n");
146 else
147 printk("probably OK (after B9730xxxx).\n");
148 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
149 }
150
151 /* K6 with old style WHCR */
152 if (c->x86_model < 8 ||
153 (c->x86_model== 8 && c->x86_mask < 8)) {
154 /* We can only write allocate on the low 508Mb */
155 if(mbytes>508)
156 mbytes=508;
157
158 rdmsr(MSR_K6_WHCR, l, h);
159 if ((l&0x0000FFFF)==0) {
160 unsigned long flags;
161 l=(1<<0)|((mbytes/4)<<1);
162 local_irq_save(flags);
163 wbinvd();
164 wrmsr(MSR_K6_WHCR, l, h);
165 local_irq_restore(flags);
166 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
167 mbytes);
168 }
169 break;
170 }
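/*
 * Worked example of the arithmetic above (illustrative): with the 508 MB
 * cap, l = (1 << 0) | ((508 / 4) << 1) = 1 | 254 = 0xff, which is then
 * written back as the low 32 bits of MSR_K6_WHCR (h is preserved from
 * the earlier rdmsr).
 */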
171
172 if ((c->x86_model == 8 && c->x86_mask >7) ||
173 c->x86_model == 9 || c->x86_model == 13) {
174 /* The more serious chips .. */
175
176 if(mbytes>4092)
177 mbytes=4092;
178
179 rdmsr(MSR_K6_WHCR, l, h);
180 if ((l&0xFFFF0000)==0) {
181 unsigned long flags;
182 l=((mbytes>>2)<<22)|(1<<16);
183 local_irq_save(flags);
184 wbinvd();
185 wrmsr(MSR_K6_WHCR, l, h);
186 local_irq_restore(flags);
187 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
188 mbytes);
189 }
190
191 /* Set MTRR capability flag if appropriate */
192 if (c->x86_model == 13 || c->x86_model == 9 ||
193 (c->x86_model == 8 && c->x86_mask >= 8))
194 set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
195 break;
196 }
197
198 if (c->x86_model == 10) {
199 /* AMD Geode LX is model 10 */
200 /* placeholder for any needed mods */
201 break;
202 }
203 break;
204 case 6: /* An Athlon/Duron */
205
206 /* Bit 15 of Athlon specific MSR 15 needs to be 0
207 * to enable SSE on Palomino/Morgan/Barton CPUs.
208 * If the BIOS didn't enable it already, enable it here.
209 */
210 if (c->x86_model >= 6 && c->x86_model <= 10) {
211 if (!cpu_has(c, X86_FEATURE_XMM)) {
212 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
213 rdmsr(MSR_K7_HWCR, l, h);
214 l &= ~0x00008000;
215 wrmsr(MSR_K7_HWCR, l, h);
216 set_bit(X86_FEATURE_XMM, c->x86_capability);
217 }
218 }
219
220 /* It's been determined by AMD that Athlons since model 8 stepping 1
221 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx,
222 * as per AMD technical note 27212 0.2.
223 */
224 if ((c->x86_model == 8 && c->x86_mask>=1) || (c->x86_model > 8)) {
225 rdmsr(MSR_K7_CLK_CTL, l, h);
226 if ((l & 0xfff00000) != 0x20000000) {
227 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
228 ((l & 0x000fffff)|0x20000000));
229 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
230 }
231 }
232 break;
233 }
234
235 switch (c->x86) {
236 case 15:
237 /* Use K8 tuning for Fam10h and Fam11h */
238 case 0x10:
239 case 0x11:
240 set_bit(X86_FEATURE_K8, c->x86_capability);
241 break;
242 case 6:
243 set_bit(X86_FEATURE_K7, c->x86_capability);
244 break;
245 }
246 if (c->x86 >= 6)
247 set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability);
248
249 display_cacheinfo(c);
250
251 if (cpuid_eax(0x80000000) >= 0x80000008) {
252 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
253 }
254
255 if (cpuid_eax(0x80000000) >= 0x80000007) {
256 c->x86_power = cpuid_edx(0x80000007);
257 if (c->x86_power & (1<<8))
258 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
259 }
260
261#ifdef CONFIG_X86_HT
262 /*
263 * On an AMD multi-core setup the lower bits of the APIC id
264 * distinguish the cores.
265 */
266 if (c->x86_max_cores > 1) {
267 int cpu = smp_processor_id();
268 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
269
270 if (bits == 0) {
271 while ((1 << bits) < c->x86_max_cores)
272 bits++;
273 }
274 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
275 c->phys_proc_id >>= bits;
276 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
277 cpu, c->x86_max_cores, c->cpu_core_id);
278 }
279#endif
280
281 if (cpuid_eax(0x80000000) >= 0x80000006) {
282 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
283 num_cache_leaves = 4;
284 else
285 num_cache_leaves = 3;
286 }
287
288#ifdef CONFIG_X86_LOCAL_APIC
289 if (amd_apic_timer_broken())
290 local_apic_timer_disabled = 1;
291#endif
292
293 if (c->x86 == 0x10 && !force_mwait)
294 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
295
296 /* K6s report MCEs but don't actually have all the MSRs */
297 if (c->x86 < 6)
298 clear_bit(X86_FEATURE_MCE, c->x86_capability);
299}
300
301static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
302{
303 /* AMD errata T13 (order #21922) */
304 if ((c->x86 == 6)) {
305 if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */
306 size = 64;
307 if (c->x86_model == 4 &&
308 (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */
309 size = 256;
310 }
311 return size;
312}
313
314static struct cpu_dev amd_cpu_dev __cpuinitdata = {
315 .c_vendor = "AMD",
316 .c_ident = { "AuthenticAMD" },
317 .c_models = {
318 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
319 {
320 [3] = "486 DX/2",
321 [7] = "486 DX/2-WB",
322 [8] = "486 DX/4",
323 [9] = "486 DX/4-WB",
324 [14] = "Am5x86-WT",
325 [15] = "Am5x86-WB"
326 }
327 },
328 },
329 .c_init = init_amd,
330 .c_size_cache = amd_size_cache,
331};
332
333int __init amd_init_cpu(void)
334{
335 cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
336 return 0;
337}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
new file mode 100644
index 000000000000..59266f03d1cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -0,0 +1,192 @@
1/*
2 * arch/i386/cpu/bugs.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Cyrix stuff, June 1998 by:
7 * - Rafael R. Reilova (moved everything from head.S),
8 * <rreilova@ececs.uc.edu>
9 * - Channing Corn (tests & fixes),
10 * - Andrew D. Balsa (code cleanup).
11 */
12#include <linux/init.h>
13#include <linux/utsname.h>
14#include <asm/bugs.h>
15#include <asm/processor.h>
16#include <asm/i387.h>
17#include <asm/msr.h>
18#include <asm/paravirt.h>
19#include <asm/alternative.h>
20
21static int __init no_halt(char *s)
22{
23 boot_cpu_data.hlt_works_ok = 0;
24 return 1;
25}
26
27__setup("no-hlt", no_halt);
28
29static int __init mca_pentium(char *s)
30{
31 mca_pentium_flag = 1;
32 return 1;
33}
34
35__setup("mca-pentium", mca_pentium);
36
37static int __init no_387(char *s)
38{
39 boot_cpu_data.hard_math = 0;
40 write_cr0(0xE | read_cr0());
41 return 1;
42}
43
44__setup("no387", no_387);
45
46static double __initdata x = 4195835.0;
47static double __initdata y = 3145727.0;
48
49/*
50 * This used to check for exceptions..
51 * However, it turns out that to support that,
52 * the XMM trap handlers basically had to
53 * be buggy. So let's have a correct XMM trap
54 * handler, and forget about printing out
55 * some status at boot.
56 *
57 * We should really only care about bugs here
58 * anyway. Not features.
59 */
60static void __init check_fpu(void)
61{
62 if (!boot_cpu_data.hard_math) {
63#ifndef CONFIG_MATH_EMULATION
64 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
65 printk(KERN_EMERG "Giving up.\n");
66 for (;;) ;
67#endif
68 return;
69 }
70
71/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
72 /* Test for the divl bug.. */
73 __asm__("fninit\n\t"
74 "fldl %1\n\t"
75 "fdivl %2\n\t"
76 "fmull %2\n\t"
77 "fldl %1\n\t"
78 "fsubp %%st,%%st(1)\n\t"
79 "fistpl %0\n\t"
80 "fwait\n\t"
81 "fninit"
82 : "=m" (*&boot_cpu_data.fdiv_bug)
83 : "m" (*&x), "m" (*&y));
84 if (boot_cpu_data.fdiv_bug)
85 printk("Hmm, FPU with FDIV bug.\n");
86}
87
88static void __init check_hlt(void)
89{
90 if (paravirt_enabled())
91 return;
92
93 printk(KERN_INFO "Checking 'hlt' instruction... ");
94 if (!boot_cpu_data.hlt_works_ok) {
95 printk("disabled\n");
96 return;
97 }
98 halt();
99 halt();
100 halt();
101 halt();
102 printk("OK.\n");
103}
104
105/*
106 * Most 386 processors have a bug where a POPAD can lock the
107 * machine even from user space.
108 */
109
110static void __init check_popad(void)
111{
112#ifndef CONFIG_X86_POPAD_OK
113 int res, inp = (int) &res;
114
115 printk(KERN_INFO "Checking for popad bug... ");
116 __asm__ __volatile__(
117 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
118 : "=&a" (res)
119 : "d" (inp)
120 : "ecx", "edi" );
121 /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
122 if (res != 12345678) printk( "Buggy.\n" );
123 else printk( "OK.\n" );
124#endif
125}
126
127/*
128 * Check whether we are able to run this kernel safely on SMP.
129 *
130 * - In order to run on an i386, we need to be compiled for i386
131 * (due to the lack of "invlpg" and working WP on an i386)
132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine,
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used), since
138 * such a config otherwise implies a properly working local APIC
139 * without the need to do extra reads from the APIC.
140*/
141
142static void __init check_config(void)
143{
144/*
145 * We'd better not be an i386 if we're configured to use some
146 * i486+ only features! (WP works in supervisor mode and the
147 * new "invlpg" and "bswap" instructions)
148 */
149#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
150 if (boot_cpu_data.x86 == 3)
151 panic("Kernel requires i486+ for 'invlpg' and other features");
152#endif
153
154/*
155 * If we configured ourselves for a TSC, we'd better have one!
156 */
157#ifdef CONFIG_X86_TSC
158 if (!cpu_has_tsc && !tsc_disable)
159 panic("Kernel compiled for Pentium+, requires TSC feature!");
160#endif
161
162/*
163 * If we were told we had a good local APIC, check for buggy Pentia,
164 * i.e. all B steppings and the C2 stepping of P54C when using their
165 * integrated APIC (see 11AP erratum in "Pentium Processor
166 * Specification Update").
167 */
168#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
169 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
170 && cpu_has_apic
171 && boot_cpu_data.x86 == 5
172 && boot_cpu_data.x86_model == 2
173 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
174 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
175#endif
176}
177
178
179void __init check_bugs(void)
180{
181 identify_boot_cpu();
182#ifndef CONFIG_SMP
183 printk("CPU: ");
184 print_cpu_info(&boot_cpu_data);
185#endif
186 check_config();
187 check_fpu();
188 check_hlt();
189 check_popad();
190 init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
191 alternative_instructions();
192}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
new file mode 100644
index 000000000000..473eac883c7b
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -0,0 +1,471 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/bitops.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include <asm/e820.h>
7#include <asm/mtrr.h>
8#include "cpu.h"
9
10#ifdef CONFIG_X86_OOSTORE
11
12static u32 __cpuinit power2(u32 x)
13{
14 u32 s=1;
15 while(s<=x)
16 s<<=1;
17 return s>>=1;
18}
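/*
 * Worked example (illustrative): power2(100) doubles s through 1, 2, 4,
 * ... 128, stops once s exceeds x, and returns s >> 1 == 64, the largest
 * power of two <= x (and 0 for x == 0).
 */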
19
20
21/*
22 * Set up an actual MCR
23 */
24
25static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
26{
27 u32 lo, hi;
28
29 hi = base & ~0xFFF;
30 lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
31 lo &= ~0xFFF; /* Remove the ctrl value bits */
32 lo |= key; /* Attribute we wish to set */
33 wrmsr(reg+MSR_IDT_MCR0, lo, hi);
34 mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
35}
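/*
 * Hypothetical example of the encoding above: base = 0, size = 512K,
 * key = 31 gives hi = 0 and lo = (~(0x7ffff) & ~0xfff) | 31 = 0xfff8001f,
 * i.e. an address mask over the covered range plus the attribute key in
 * the low bits.
 */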
36
37/*
38 * Figure what we can cover with MCR's
39 *
40 * Shortcut: We know you can't put 4Gig of RAM on a winchip
41 */
42
43static u32 __cpuinit ramtop(void) /* 16388 */
44{
45 int i;
46 u32 top = 0;
47 u32 clip = 0xFFFFFFFFUL;
48
49 for (i = 0; i < e820.nr_map; i++) {
50 unsigned long start, end;
51
52 if (e820.map[i].addr > 0xFFFFFFFFUL)
53 continue;
54 /*
55 * Don't MCR over reserved space. Ignore the ISA hole;
56 * we frob around that catastrophe already.
57 */
58
59 if (e820.map[i].type == E820_RESERVED)
60 {
61 if(e820.map[i].addr >= 0x100000UL && e820.map[i].addr < clip)
62 clip = e820.map[i].addr;
63 continue;
64 }
65 start = e820.map[i].addr;
66 end = e820.map[i].addr + e820.map[i].size;
67 if (start >= end)
68 continue;
69 if (end > top)
70 top = end;
71 }
72 /* Everything below 'top' should be RAM except for the ISA hole.
73 Because of the limited MCRs we want to map NV/ACPI into our
74 MCR range for gunk in RAM.
75
76 Clip might cause us to MCR insufficient RAM but that is an
77 acceptable failure mode and should only bite obscure boxes with
78 a VESA hole at 15Mb.
79
80 The second case where Clip kicks in is when the EBDA is marked
81 as reserved. Again we fail safe with reasonable results.
82 */
83
84 if(top>clip)
85 top=clip;
86
87 return top;
88}
89
90/*
91 * Compute a set of MCR's to give maximum coverage
92 */
93
94static int __cpuinit centaur_mcr_compute(int nr, int key)
95{
96 u32 mem = ramtop();
97 u32 root = power2(mem);
98 u32 base = root;
99 u32 top = root;
100 u32 floor = 0;
101 int ct = 0;
102
103 while(ct<nr)
104 {
105 u32 fspace = 0;
106
107 /*
108 * Find the largest block we will fill going upwards
109 */
110
111 u32 high = power2(mem-top);
112
113 /*
114 * Find the largest block we will fill going downwards
115 */
116
117 u32 low = base/2;
118
119 /*
120 * Don't fill below 1Mb going downwards as there
121 * is an ISA hole in the way.
122 */
123
124 if(base <= 1024*1024)
125 low = 0;
126
127 /*
128 * See how much space we could cover by filling below
129 * the ISA hole
130 */
131
132 if(floor == 0)
133 fspace = 512*1024;
134 else if(floor ==512*1024)
135 fspace = 128*1024;
136
137 /* And forget ROM space */
138
139 /*
140 * Now install the largest coverage we get
141 */
142
143 if(fspace > high && fspace > low)
144 {
145 centaur_mcr_insert(ct, floor, fspace, key);
146 floor += fspace;
147 }
148 else if(high > low)
149 {
150 centaur_mcr_insert(ct, top, high, key);
151 top += high;
152 }
153 else if(low > 0)
154 {
155 base -= low;
156 centaur_mcr_insert(ct, base, low, key);
157 }
158 else break;
159 ct++;
160 }
161 /*
162 * We loaded ct values. We now need to set the mask. The caller
163 * must do this bit.
164 */
165
166 return ct;
167}
168
169static void __cpuinit centaur_create_optimal_mcr(void)
170{
171 int i;
172 /*
173 * Allocate up to 6 mcrs to mark as much of ram as possible
174 * as write combining and weak write ordered.
175 *
176 * To experiment with: Linux never uses stack operations for
177 * mmio spaces so we could globally enable stack operation wc
178 *
179 * Load the registers with type 31 - full write combining, all
180 * writes weakly ordered.
181 */
182 int used = centaur_mcr_compute(6, 31);
183
184 /*
185 * Wipe unused MCRs
186 */
187
188 for(i=used;i<8;i++)
189 wrmsr(MSR_IDT_MCR0+i, 0, 0);
190}
191
192static void __cpuinit winchip2_create_optimal_mcr(void)
193{
194 u32 lo, hi;
195 int i;
196
197 /*
198 * Allocate up to 6 mcrs to mark as much of ram as possible
199 * as write combining, weak store ordered.
200 *
201 * Load the registers with type 25
202 * 8 - weak write ordering
203 * 16 - weak read ordering
204 * 1 - write combining
205 */
206
207 int used = centaur_mcr_compute(6, 25);
208
209 /*
210 * Mark the registers we are using.
211 */
212
213 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
214 for(i=0;i<used;i++)
215 lo|=1<<(9+i);
216 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
217
218 /*
219 * Wipe unused MCRs
220 */
221
222 for(i=used;i<8;i++)
223 wrmsr(MSR_IDT_MCR0+i, 0, 0);
224}
225
226/*
227 * Handle the MCR key on the Winchip 2.
228 */
229
230static void __cpuinit winchip2_unprotect_mcr(void)
231{
232 u32 lo, hi;
233 u32 key;
234
235 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
236 lo&=~0x1C0; /* blank bits 8-6 */
237 key = (lo>>17) & 7;
238 lo |= key<<6; /* replace with unlock key */
239 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
240}
241
242static void __cpuinit winchip2_protect_mcr(void)
243{
244 u32 lo, hi;
245
246 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
247 lo&=~0x1C0; /* blank bits 8-6 */
248 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
249}
250#endif /* CONFIG_X86_OOSTORE */
251
252#define ACE_PRESENT (1 << 6)
253#define ACE_ENABLED (1 << 7)
254#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
255
256#define RNG_PRESENT (1 << 2)
257#define RNG_ENABLED (1 << 3)
258#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
259
260static void __cpuinit init_c3(struct cpuinfo_x86 *c)
261{
262 u32 lo, hi;
263
264 /* Test for Centaur Extended Feature Flags presence */
265 if (cpuid_eax(0xC0000000) >= 0xC0000001) {
266 u32 tmp = cpuid_edx(0xC0000001);
267
268 /* enable ACE unit, if present and disabled */
269 if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
270 rdmsr (MSR_VIA_FCR, lo, hi);
271 lo |= ACE_FCR; /* enable ACE unit */
272 wrmsr (MSR_VIA_FCR, lo, hi);
273 printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
274 }
275
276 /* enable RNG unit, if present and disabled */
277 if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
278 rdmsr (MSR_VIA_RNG, lo, hi);
279 lo |= RNG_ENABLE; /* enable RNG unit */
280 wrmsr (MSR_VIA_RNG, lo, hi);
281 printk(KERN_INFO "CPU: Enabled h/w RNG\n");
282 }
283
284 /* store Centaur Extended Feature Flags as
285 * word 5 of the CPU capability bit array
286 */
287 c->x86_capability[5] = cpuid_edx(0xC0000001);
288 }
289
 290	/* Cyrix III family needs CX8 & PGE explicitly enabled. */
291 if (c->x86_model >=6 && c->x86_model <= 9) {
292 rdmsr (MSR_VIA_FCR, lo, hi);
293 lo |= (1<<1 | 1<<7);
294 wrmsr (MSR_VIA_FCR, lo, hi);
295 set_bit(X86_FEATURE_CX8, c->x86_capability);
296 }
297
 298	/* Before Nehemiah, the C3s had 3DNow! */
299 if (c->x86_model >=6 && c->x86_model <9)
300 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
301
302 get_model_name(c);
303 display_cacheinfo(c);
304}
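The ACE and RNG checks above both rely on the same "present but not yet enabled" mask test. The sketch below (a hypothetical stand-alone illustration, not kernel code) walks the four possible flag combinations to show when the MSR write would actually be performed:

	#include <stdio.h>

	#define ACE_PRESENT (1 << 6)
	#define ACE_ENABLED (1 << 7)

	/* Sketch of the (tmp & (PRESENT | ENABLED)) == PRESENT test used
	 * for the ACE and RNG units, over all four flag combinations. */
	int main(void)
	{
		unsigned cases[] = { 0, ACE_PRESENT, ACE_ENABLED,
				     ACE_PRESENT | ACE_ENABLED };
		unsigned i;

		for (i = 0; i < 4; i++) {
			unsigned tmp = cases[i];
			int need_enable =
				(tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT;
			printf("present=%d enabled=%d -> enable now? %d\n",
			       !!(tmp & ACE_PRESENT), !!(tmp & ACE_ENABLED),
			       need_enable);
		}
		return 0;
	}

Only the "present, not enabled" case triggers the FCR/RNG MSR update; already-enabled or absent units are left alone.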
305
306static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
307{
308 enum {
309 ECX8=1<<1,
310 EIERRINT=1<<2,
311 DPM=1<<3,
312 DMCE=1<<4,
313 DSTPCLK=1<<5,
314 ELINEAR=1<<6,
315 DSMC=1<<7,
316 DTLOCK=1<<8,
317 EDCTLB=1<<8,
318 EMMX=1<<9,
319 DPDC=1<<11,
320 EBRPRED=1<<12,
321 DIC=1<<13,
322 DDC=1<<14,
323 DNA=1<<15,
324 ERETSTK=1<<16,
325 E2MMX=1<<19,
326 EAMD3D=1<<20,
327 };
328
329 char *name;
330 u32 fcr_set=0;
331 u32 fcr_clr=0;
332 u32 lo,hi,newlo;
333 u32 aa,bb,cc,dd;
334
 335	/* Bit 31 in normal CPUID is used for a nonstandard 3DNow! ID;
 336	   3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
337 clear_bit(0*32+31, c->x86_capability);
338
339 switch (c->x86) {
340
341 case 5:
342 switch(c->x86_model) {
343 case 4:
344 name="C6";
345 fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
346 fcr_clr=DPDC;
347 printk(KERN_NOTICE "Disabling bugged TSC.\n");
348 clear_bit(X86_FEATURE_TSC, c->x86_capability);
349#ifdef CONFIG_X86_OOSTORE
350 centaur_create_optimal_mcr();
351 /* Enable
352 write combining on non-stack, non-string
353 write combining on string, all types
354 weak write ordering
355
356 The C6 original lacks weak read order
357
358 Note 0x120 is write only on Winchip 1 */
359
360 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
361#endif
362 break;
363 case 8:
364 switch(c->x86_mask) {
365 default:
366 name="2";
367 break;
368 case 7 ... 9:
369 name="2A";
370 break;
371 case 10 ... 15:
372 name="2B";
373 break;
374 }
375 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
376 fcr_clr=DPDC;
377#ifdef CONFIG_X86_OOSTORE
378 winchip2_unprotect_mcr();
379 winchip2_create_optimal_mcr();
380 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
381 /* Enable
382 write combining on non-stack, non-string
383 write combining on string, all types
384 weak write ordering
385 */
386 lo|=31;
387 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
388 winchip2_protect_mcr();
389#endif
390 break;
391 case 9:
392 name="3";
393 fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
394 fcr_clr=DPDC;
395#ifdef CONFIG_X86_OOSTORE
396 winchip2_unprotect_mcr();
397 winchip2_create_optimal_mcr();
398 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
399 /* Enable
400 write combining on non-stack, non-string
401 write combining on string, all types
402 weak write ordering
403 */
404 lo|=31;
405 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
406 winchip2_protect_mcr();
407#endif
408 break;
409 default:
410 name="??";
411 }
412
413 rdmsr(MSR_IDT_FCR1, lo, hi);
414 newlo=(lo|fcr_set) & (~fcr_clr);
415
416 if (newlo!=lo) {
417 printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", lo, newlo );
418 wrmsr(MSR_IDT_FCR1, newlo, hi );
419 } else {
420 printk(KERN_INFO "Centaur FCR is 0x%X\n",lo);
421 }
422 /* Emulate MTRRs using Centaur's MCR. */
423 set_bit(X86_FEATURE_CENTAUR_MCR, c->x86_capability);
424 /* Report CX8 */
425 set_bit(X86_FEATURE_CX8, c->x86_capability);
426 /* Set 3DNow! on Winchip 2 and above. */
427 if (c->x86_model >=8)
428 set_bit(X86_FEATURE_3DNOW, c->x86_capability);
429 /* See if we can find out some more. */
430 if ( cpuid_eax(0x80000000) >= 0x80000005 ) {
431 /* Yes, we can. */
432 cpuid(0x80000005,&aa,&bb,&cc,&dd);
433 /* Add L1 data and code cache sizes. */
434 c->x86_cache_size = (cc>>24)+(dd>>24);
435 }
436 sprintf( c->x86_model_id, "WinChip %s", name );
437 break;
438
439 case 6:
440 init_c3(c);
441 break;
442 }
443}
444
445static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
446{
447 /* VIA C3 CPUs (670-68F) need further shifting. */
448 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
449 size >>= 8;
450
451 /* VIA also screwed up Nehemiah stepping 1, and made
452 it return '65KB' instead of '64KB'
453 - Note, it seems this may only be in engineering samples. */
454 if ((c->x86==6) && (c->x86_model==9) && (c->x86_mask==1) && (size==65))
455 size -=1;
456
457 return size;
458}
459
460static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
461 .c_vendor = "Centaur",
462 .c_ident = { "CentaurHauls" },
463 .c_init = init_centaur,
464 .c_size_cache = centaur_size_cache,
465};
466
467int __init centaur_init_cpu(void)
468{
469 cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
470 return 0;
471}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
new file mode 100644
index 000000000000..d506201d397c
--- /dev/null
+++ b/arch/x86/kernel/cpu/common.c
@@ -0,0 +1,733 @@
1#include <linux/init.h>
2#include <linux/string.h>
3#include <linux/delay.h>
4#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/semaphore.h>
9#include <asm/processor.h>
10#include <asm/i387.h>
11#include <asm/msr.h>
12#include <asm/io.h>
13#include <asm/mmu_context.h>
14#include <asm/mtrr.h>
15#include <asm/mce.h>
16#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h>
18#include <asm/apic.h>
19#include <mach_apic.h>
20#endif
21
22#include "cpu.h"
23
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
29 /*
30 * Segments used for calling PnP BIOS have byte granularity.
 31	 * The code segments and data segments have fixed 64k limits;
32 * the transfer segment sizes are set at run time.
33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
39 /*
40 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits.
42 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
44 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
47
48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
50} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
52
53static int cachesize_override __cpuinitdata = -1;
54static int disable_x86_fxsr __cpuinitdata;
55static int disable_x86_serial_nr __cpuinitdata = 1;
56static int disable_x86_sep __cpuinitdata;
57
58struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
59
60extern int disable_pse;
61
62static void __cpuinit default_init(struct cpuinfo_x86 * c)
63{
64 /* Not much we can do here... */
65 /* Check if at least it has cpuid */
66 if (c->cpuid_level == -1) {
67 /* No cpuid. It must be an ancient CPU */
68 if (c->x86 == 4)
69 strcpy(c->x86_model_id, "486");
70 else if (c->x86 == 3)
71 strcpy(c->x86_model_id, "386");
72 }
73}
74
75static struct cpu_dev __cpuinitdata default_cpu = {
76 .c_init = default_init,
77 .c_vendor = "Unknown",
78};
79static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
80
81static int __init cachesize_setup(char *str)
82{
83 get_option (&str, &cachesize_override);
84 return 1;
85}
86__setup("cachesize=", cachesize_setup);
87
88int __cpuinit get_model_name(struct cpuinfo_x86 *c)
89{
90 unsigned int *v;
91 char *p, *q;
92
93 if (cpuid_eax(0x80000000) < 0x80000004)
94 return 0;
95
96 v = (unsigned int *) c->x86_model_id;
97 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
98 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
99 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
100 c->x86_model_id[48] = 0;
101
102 /* Intel chips right-justify this string for some dumb reason;
103 undo that brain damage */
104 p = q = &c->x86_model_id[0];
105 while ( *p == ' ' )
106 p++;
107 if ( p != q ) {
108 while ( *p )
109 *q++ = *p++;
110 while ( q <= &c->x86_model_id[48] )
111 *q++ = '\0'; /* Zero-pad the rest */
112 }
113
114 return 1;
115}
116
117
118void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
119{
120 unsigned int n, dummy, ecx, edx, l2size;
121
122 n = cpuid_eax(0x80000000);
123
124 if (n >= 0x80000005) {
125 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
126 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
127 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
128 c->x86_cache_size=(ecx>>24)+(edx>>24);
129 }
130
 131	if (n < 0x80000006)	/* Some chips just have a large L1. */
132 return;
133
134 ecx = cpuid_ecx(0x80000006);
135 l2size = ecx >> 16;
136
137 /* do processor-specific cache resizing */
138 if (this_cpu->c_size_cache)
139 l2size = this_cpu->c_size_cache(c,l2size);
140
141 /* Allow user to override all this if necessary. */
142 if (cachesize_override != -1)
143 l2size = cachesize_override;
144
145 if ( l2size == 0 )
146 return; /* Again, no L2 cache is possible */
147
148 c->x86_cache_size = l2size;
149
150 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
151 l2size, ecx & 0xFF);
152}
153
154/* Naming convention should be: <Name> [(<Codename>)] */
 155/* This table is only used if init_<vendor>() below doesn't set the name; */
156/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
157
158/* Look up CPU names by table lookup. */
159static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
160{
161 struct cpu_model_info *info;
162
163 if ( c->x86_model >= 16 )
164 return NULL; /* Range check */
165
166 if (!this_cpu)
167 return NULL;
168
169 info = this_cpu->c_models;
170
171 while (info && info->family) {
172 if (info->family == c->x86)
173 return info->model_names[c->x86_model];
174 info++;
175 }
176 return NULL; /* Not found */
177}
178
179
180static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
181{
182 char *v = c->x86_vendor_id;
183 int i;
184 static int printed;
185
186 for (i = 0; i < X86_VENDOR_NUM; i++) {
187 if (cpu_devs[i]) {
188 if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
189 (cpu_devs[i]->c_ident[1] &&
190 !strcmp(v,cpu_devs[i]->c_ident[1]))) {
191 c->x86_vendor = i;
192 if (!early)
193 this_cpu = cpu_devs[i];
194 return;
195 }
196 }
197 }
198 if (!printed) {
199 printed++;
200 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
201 printk(KERN_ERR "CPU: Your system may be unstable.\n");
202 }
203 c->x86_vendor = X86_VENDOR_UNKNOWN;
204 this_cpu = &default_cpu;
205}
206
207
208static int __init x86_fxsr_setup(char * s)
209{
210 /* Tell all the other CPU's to not use it... */
211 disable_x86_fxsr = 1;
212
213 /*
214 * ... and clear the bits early in the boot_cpu_data
215 * so that the bootup process doesn't try to do this
216 * either.
217 */
218 clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
219 clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char * s)
226{
227 disable_x86_sep = 1;
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{
263 /* Get vendor name */
264 cpuid(0x00000000, &c->cpuid_level,
265 (int *)&c->x86_vendor_id[0],
266 (int *)&c->x86_vendor_id[8],
267 (int *)&c->x86_vendor_id[4]);
268
269 c->x86 = 4;
270 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15;
274 c->x86_model = (tfms >> 4) & 15;
275 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19))
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 }
283}
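To make the family/model/stepping arithmetic in cpu_detect() concrete, here is a minimal stand-alone sketch (not part of the patch) decoding a hypothetical CPUID.1:EAX signature value of 0x000106A5, including the extended-family and extended-model adjustments:

	#include <stdio.h>

	/* Sketch of the signature decode done in cpu_detect(), applied to
	 * a hypothetical tfms value of 0x000106A5. */
	int main(void)
	{
		unsigned tfms = 0x000106A5;
		unsigned family = (tfms >> 8) & 15;	/* 0x6 */
		unsigned model  = (tfms >> 4) & 15;	/* 0xA */
		unsigned step   = tfms & 15;		/* 0x5 */

		if (family == 0xf)			/* extended family */
			family += (tfms >> 20) & 0xff;
		if (family >= 0x6)			/* extended model  */
			model += ((tfms >> 16) & 0xF) << 4;	/* -> 0x1A */

		printf("family %#x model %#x stepping %#x\n", family, model, step);
		return 0;
	}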
284
285/* Do minimum CPU detection early.
286 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
287 The others are not touched to avoid unwanted side effects.
288
289 WARNING: this function is only called on the BP. Don't add code here
290 that is supposed to run on all CPUs. */
291static void __init early_cpu_detect(void)
292{
293 struct cpuinfo_x86 *c = &boot_cpu_data;
294
295 c->x86_cache_alignment = 32;
296
297 if (!have_cpuid_p())
298 return;
299
300 cpu_detect(c);
301
302 get_cpu_vendor(c, 1);
303}
304
305static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
306{
307 u32 tfms, xlvl;
308 int ebx;
309
310 if (have_cpuid_p()) {
311 /* Get vendor name */
312 cpuid(0x00000000, &c->cpuid_level,
313 (int *)&c->x86_vendor_id[0],
314 (int *)&c->x86_vendor_id[8],
315 (int *)&c->x86_vendor_id[4]);
316
317 get_cpu_vendor(c, 0);
318 /* Initialize the standard set of capabilities */
319 /* Note that the vendor-specific code below might override */
320
321 /* Intel-defined flags: level 0x00000001 */
322 if ( c->cpuid_level >= 0x00000001 ) {
323 u32 capability, excap;
324 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
325 c->x86_capability[0] = capability;
326 c->x86_capability[4] = excap;
327 c->x86 = (tfms >> 8) & 15;
328 c->x86_model = (tfms >> 4) & 15;
329 if (c->x86 == 0xf)
330 c->x86 += (tfms >> 20) & 0xff;
331 if (c->x86 >= 0x6)
332 c->x86_model += ((tfms >> 16) & 0xF) << 4;
333 c->x86_mask = tfms & 15;
334#ifdef CONFIG_X86_HT
335 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
336#else
337 c->apicid = (ebx >> 24) & 0xFF;
338#endif
339 if (c->x86_capability[0] & (1<<19))
340 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
341 } else {
342 /* Have CPUID level 0 only - unheard of */
343 c->x86 = 4;
344 }
345
346 /* AMD-defined flags: level 0x80000001 */
347 xlvl = cpuid_eax(0x80000000);
348 if ( (xlvl & 0xffff0000) == 0x80000000 ) {
349 if ( xlvl >= 0x80000001 ) {
350 c->x86_capability[1] = cpuid_edx(0x80000001);
351 c->x86_capability[6] = cpuid_ecx(0x80000001);
352 }
353 if ( xlvl >= 0x80000004 )
354 get_model_name(c); /* Default name */
355 }
356
357 init_scattered_cpuid_features(c);
358 }
359
360 early_intel_workaround(c);
361
362#ifdef CONFIG_X86_HT
363 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
364#endif
365}
366
367static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
368{
369 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
370 /* Disable processor serial number */
371 unsigned long lo,hi;
372 rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
373 lo |= 0x200000;
374 wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
375 printk(KERN_NOTICE "CPU serial number disabled.\n");
376 clear_bit(X86_FEATURE_PN, c->x86_capability);
377
378 /* Disabling the serial number may affect the cpuid level */
379 c->cpuid_level = cpuid_eax(0);
380 }
381}
382
383static int __init x86_serial_nr_setup(char *s)
384{
385 disable_x86_serial_nr = 0;
386 return 1;
387}
388__setup("serialnumber", x86_serial_nr_setup);
389
390
391
392/*
393 * This does the hard work of actually picking apart the CPU stuff...
394 */
395static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
396{
397 int i;
398
399 c->loops_per_jiffy = loops_per_jiffy;
400 c->x86_cache_size = -1;
401 c->x86_vendor = X86_VENDOR_UNKNOWN;
402 c->cpuid_level = -1; /* CPUID not detected */
403 c->x86_model = c->x86_mask = 0; /* So far unknown... */
404 c->x86_vendor_id[0] = '\0'; /* Unset */
405 c->x86_model_id[0] = '\0'; /* Unset */
406 c->x86_max_cores = 1;
407 c->x86_clflush_size = 32;
408 memset(&c->x86_capability, 0, sizeof c->x86_capability);
409
410 if (!have_cpuid_p()) {
411 /* First of all, decide if this is a 486 or higher */
412 /* It's a 486 if we can modify the AC flag */
413 if ( flag_is_changeable_p(X86_EFLAGS_AC) )
414 c->x86 = 4;
415 else
416 c->x86 = 3;
417 }
418
419 generic_identify(c);
420
421 printk(KERN_DEBUG "CPU: After generic identify, caps:");
422 for (i = 0; i < NCAPINTS; i++)
423 printk(" %08lx", c->x86_capability[i]);
424 printk("\n");
425
426 if (this_cpu->c_identify) {
427 this_cpu->c_identify(c);
428
429 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
430 for (i = 0; i < NCAPINTS; i++)
431 printk(" %08lx", c->x86_capability[i]);
432 printk("\n");
433 }
434
435 /*
436 * Vendor-specific initialization. In this section we
437 * canonicalize the feature flags, meaning if there are
438 * features a certain CPU supports which CPUID doesn't
439 * tell us, CPUID claiming incorrect flags, or other bugs,
440 * we handle them here.
441 *
442 * At the end of this section, c->x86_capability better
443 * indicate the features this CPU genuinely supports!
444 */
445 if (this_cpu->c_init)
446 this_cpu->c_init(c);
447
448 /* Disable the PN if appropriate */
449 squash_the_stupid_serial_number(c);
450
451 /*
452 * The vendor-specific functions might have changed features. Now
453 * we do "generic changes."
454 */
455
456 /* TSC disabled? */
457 if ( tsc_disable )
458 clear_bit(X86_FEATURE_TSC, c->x86_capability);
459
460 /* FXSR disabled? */
461 if (disable_x86_fxsr) {
462 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
463 clear_bit(X86_FEATURE_XMM, c->x86_capability);
464 }
465
466 /* SEP disabled? */
467 if (disable_x86_sep)
468 clear_bit(X86_FEATURE_SEP, c->x86_capability);
469
470 if (disable_pse)
471 clear_bit(X86_FEATURE_PSE, c->x86_capability);
472
473 /* If the model name is still unset, do table lookup. */
474 if ( !c->x86_model_id[0] ) {
475 char *p;
476 p = table_lookup_model(c);
477 if ( p )
478 strcpy(c->x86_model_id, p);
479 else
480 /* Last resort... */
481 sprintf(c->x86_model_id, "%02x/%02x",
482 c->x86, c->x86_model);
483 }
484
485 /* Now the feature flags better reflect actual CPU features! */
486
487 printk(KERN_DEBUG "CPU: After all inits, caps:");
488 for (i = 0; i < NCAPINTS; i++)
489 printk(" %08lx", c->x86_capability[i]);
490 printk("\n");
491
492 /*
493 * On SMP, boot_cpu_data holds the common feature set between
494 * all CPUs; so make sure that we indicate which features are
495 * common between the CPUs. The first time this routine gets
496 * executed, c == &boot_cpu_data.
497 */
498 if ( c != &boot_cpu_data ) {
499 /* AND the already accumulated flags with these */
500 for ( i = 0 ; i < NCAPINTS ; i++ )
501 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
502 }
503
504 /* Init Machine Check Exception if available. */
505 mcheck_init(c);
506}
507
508void __init identify_boot_cpu(void)
509{
510 identify_cpu(&boot_cpu_data);
511 sysenter_setup();
512 enable_sep_cpu();
513 mtrr_bp_init();
514}
515
516void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
517{
518 BUG_ON(c == &boot_cpu_data);
519 identify_cpu(c);
520 enable_sep_cpu();
521 mtrr_ap_init();
522}
523
524#ifdef CONFIG_X86_HT
525void __cpuinit detect_ht(struct cpuinfo_x86 *c)
526{
527 u32 eax, ebx, ecx, edx;
528 int index_msb, core_bits;
529
530 cpuid(1, &eax, &ebx, &ecx, &edx);
531
532 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
533 return;
534
535 smp_num_siblings = (ebx & 0xff0000) >> 16;
536
537 if (smp_num_siblings == 1) {
538 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
539 } else if (smp_num_siblings > 1 ) {
540
541 if (smp_num_siblings > NR_CPUS) {
 542			printk(KERN_WARNING "CPU: Unsupported number of "
 543					"siblings %d\n", smp_num_siblings);
544 smp_num_siblings = 1;
545 return;
546 }
547
548 index_msb = get_count_order(smp_num_siblings);
549 c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
550
551 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
552 c->phys_proc_id);
553
554 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
555
556 index_msb = get_count_order(smp_num_siblings) ;
557
558 core_bits = get_count_order(c->x86_max_cores);
559
560 c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
561 ((1 << core_bits) - 1);
562
563 if (c->x86_max_cores > 1)
564 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
565 c->cpu_core_id);
566 }
567}
568#endif
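The sibling/core-ID derivation in detect_ht() is easiest to follow with numbers plugged in. The sketch below (a stand-alone illustration with hypothetical values; count_order() is a minimal stand-in for the kernel's get_count_order()) shows the math for a CPU with initial APIC ID 5, 4 logical siblings per package and 2 cores:

	#include <stdio.h>

	/* Minimal stand-in for get_count_order(): ceil(log2(n)). */
	static int count_order(unsigned n)
	{
		int order = 0;
		while ((1u << order) < n)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned apicid = 5, siblings = 4, max_cores = 2;

		int index_msb = count_order(siblings);			/* 2 */
		unsigned phys_proc_id = apicid >> index_msb;		/* 1 */

		unsigned threads_per_core = siblings / max_cores;	/* 2 */
		index_msb = count_order(threads_per_core);		/* 1 */
		int core_bits = count_order(max_cores);			/* 1 */
		unsigned core_id = (apicid >> index_msb)
				   & ((1u << core_bits) - 1);		/* 0 */

		printf("package %u core %u\n", phys_proc_id, core_id);
		return 0;
	}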
569
570void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
571{
572 char *vendor = NULL;
573
574 if (c->x86_vendor < X86_VENDOR_NUM)
575 vendor = this_cpu->c_vendor;
576 else if (c->cpuid_level >= 0)
577 vendor = c->x86_vendor_id;
578
579 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
580 printk("%s ", vendor);
581
582 if (!c->x86_model_id[0])
583 printk("%d86", c->x86);
584 else
585 printk("%s", c->x86_model_id);
586
587 if (c->x86_mask || c->cpuid_level >= 0)
588 printk(" stepping %02x\n", c->x86_mask);
589 else
590 printk("\n");
591}
592
593cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
594
595/* This is hacky. :)
596 * We're emulating future behavior.
597 * In the future, the cpu-specific init functions will be called implicitly
598 * via the magic of initcalls.
599 * They will insert themselves into the cpu_devs structure.
600 * Then, when cpu_init() is called, we can just iterate over that array.
601 */
602
603extern int intel_cpu_init(void);
604extern int cyrix_init_cpu(void);
605extern int nsc_init_cpu(void);
606extern int amd_init_cpu(void);
607extern int centaur_init_cpu(void);
608extern int transmeta_init_cpu(void);
609extern int nexgen_init_cpu(void);
610extern int umc_init_cpu(void);
611
612void __init early_cpu_init(void)
613{
614 intel_cpu_init();
615 cyrix_init_cpu();
616 nsc_init_cpu();
617 amd_init_cpu();
618 centaur_init_cpu();
619 transmeta_init_cpu();
620 nexgen_init_cpu();
621 umc_init_cpu();
622 early_cpu_detect();
623
624#ifdef CONFIG_DEBUG_PAGEALLOC
625 /* pse is not compatible with on-the-fly unmapping,
626 * disable it even if the cpus claim to support it.
627 */
628 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
629 disable_pse = 1;
630#endif
631}
632
633/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
635{
636 memset(regs, 0, sizeof(struct pt_regs));
637 regs->xfs = __KERNEL_PERCPU;
638 return regs;
639}
640
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct Xgt_desc_struct gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652
653/*
654 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across.
658 */
659void __cpuinit cpu_init(void)
660{
661 int cpu = smp_processor_id();
662 struct task_struct *curr = current;
663 struct tss_struct * t = &per_cpu(init_tss, cpu);
664 struct thread_struct *thread = &curr->thread;
665
666 if (cpu_test_and_set(cpu, cpu_initialized)) {
667 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
668 for (;;) local_irq_enable();
669 }
670
671 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
672
673 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
674 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
675 if (tsc_disable && cpu_has_tsc) {
676 printk(KERN_NOTICE "Disabling TSC...\n");
677 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
678 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
679 set_in_cr4(X86_CR4_TSD);
680 }
681
682 load_idt(&idt_descr);
683 switch_to_new_gdt();
684
685 /*
686 * Set up and load the per-CPU TSS and LDT
687 */
688 atomic_inc(&init_mm.mm_count);
689 curr->active_mm = &init_mm;
690 if (curr->mm)
691 BUG();
692 enter_lazy_tlb(&init_mm, curr);
693
694 load_esp0(t, thread);
695 set_tss_desc(cpu,t);
696 load_TR_desc();
697 load_LDT(&init_mm.context);
698
699#ifdef CONFIG_DOUBLEFAULT
700 /* Set up doublefault TSS pointer in the GDT */
701 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
702#endif
703
704 /* Clear %gs. */
705 asm volatile ("mov %0, %%gs" : : "r" (0));
706
707 /* Clear all 6 debug registers: */
708 set_debugreg(0, 0);
709 set_debugreg(0, 1);
710 set_debugreg(0, 2);
711 set_debugreg(0, 3);
712 set_debugreg(0, 6);
713 set_debugreg(0, 7);
714
715 /*
716 * Force FPU initialization:
717 */
718 current_thread_info()->status = 0;
719 clear_used_math();
720 mxcsr_feature_mask_init();
721}
722
723#ifdef CONFIG_HOTPLUG_CPU
724void __cpuinit cpu_uninit(void)
725{
726 int cpu = raw_smp_processor_id();
727 cpu_clear(cpu, cpu_initialized);
728
729 /* lazy TLB state */
730 per_cpu(cpu_tlbstate, cpu).state = 0;
731 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
732}
733#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
new file mode 100644
index 000000000000..2f6432cef6ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -0,0 +1,28 @@
1
2struct cpu_model_info {
3 int vendor;
4 int family;
5 char *model_names[16];
6};
7
8/* attempt to consolidate cpu attributes */
9struct cpu_dev {
10 char * c_vendor;
11
12 /* some have two possibilities for cpuid string */
13 char * c_ident[2];
14
15 struct cpu_model_info c_models[4];
16
17 void (*c_init)(struct cpuinfo_x86 * c);
18 void (*c_identify)(struct cpuinfo_x86 * c);
19 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
20};
21
22extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
23
24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26
27extern void early_intel_workaround(struct cpuinfo_x86 *c);
28
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
new file mode 100644
index 000000000000..d8c6f132dc7a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -0,0 +1,250 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE
16 depends on ACPI_PROCESSOR
17 help
18 This driver adds a CPUFreq driver which utilizes the ACPI
19 Processor Performance States.
20 This driver also supports Intel Enhanced Speedstep.
21
22 For details, take a look at <file:Documentation/cpu-freq/>.
23
24 If in doubt, say N.
25
26config ELAN_CPUFREQ
27 tristate "AMD Elan SC400 and SC410"
28 select CPU_FREQ_TABLE
29 depends on X86_ELAN
30 ---help---
31 This adds the CPUFreq driver for AMD Elan SC400 and SC410
32 processors.
33
34 You need to specify the processor maximum speed as boot
35 parameter: elanfreq=maxspeed (in kHz) or as module
36 parameter "max_freq".
37
38 For details, take a look at <file:Documentation/cpu-freq/>.
39
40 If in doubt, say N.
41
42config SC520_CPUFREQ
43 tristate "AMD Elan SC520"
44 select CPU_FREQ_TABLE
45 depends on X86_ELAN
46 ---help---
47 This adds the CPUFreq driver for AMD Elan SC520 processor.
48
49 For details, take a look at <file:Documentation/cpu-freq/>.
50
51 If in doubt, say N.
52
53
54config X86_POWERNOW_K6
55 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
56 select CPU_FREQ_TABLE
57 help
58 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
59 AMD K6-3+ processors.
60
61 For details, take a look at <file:Documentation/cpu-freq/>.
62
63 If in doubt, say N.
64
65config X86_POWERNOW_K7
66 tristate "AMD Mobile Athlon/Duron PowerNow!"
67 select CPU_FREQ_TABLE
68 help
69 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
70
71 For details, take a look at <file:Documentation/cpu-freq/>.
72
73 If in doubt, say N.
74
75config X86_POWERNOW_K7_ACPI
76 bool
77 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
78 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
79 default y
80
81config X86_POWERNOW_K8
82 tristate "AMD Opteron/Athlon64 PowerNow!"
83 select CPU_FREQ_TABLE
84 depends on EXPERIMENTAL
85 help
86 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
87
88 For details, take a look at <file:Documentation/cpu-freq/>.
89
90 If in doubt, say N.
91
92config X86_POWERNOW_K8_ACPI
93 bool "ACPI Support"
94 select ACPI_PROCESSOR
95 depends on ACPI && X86_POWERNOW_K8
96 default y
97 help
 98	  This provides access to the K8's Processor Performance States via ACPI.
99 This driver is probably required for CPUFreq to work with multi-socket and
100 SMP systems. It is not required on at least some single-socket yet
101 multi-core systems, even if SMP is enabled.
102
103 It is safe to say Y here.
104
105config X86_GX_SUSPMOD
106 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
107 depends on PCI
108 help
 109	  This adds the CPUFreq driver for NatSemi Geode processors which
110 support suspend modulation.
111
112 For details, take a look at <file:Documentation/cpu-freq/>.
113
114 If in doubt, say N.
115
116config X86_SPEEDSTEP_CENTRINO
117 tristate "Intel Enhanced SpeedStep"
118 select CPU_FREQ_TABLE
119 select X86_SPEEDSTEP_CENTRINO_TABLE
120 help
121 This adds the CPUFreq driver for Enhanced SpeedStep enabled
122 mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However,
123 you also need to say Y to "Use ACPI tables to decode..." below
124 [which might imply enabling ACPI] if you want to use this driver
125 on non-Banias CPUs.
126
127 For details, take a look at <file:Documentation/cpu-freq/>.
128
129 If in doubt, say N.
130
131config X86_SPEEDSTEP_CENTRINO_TABLE
132 bool "Built-in tables for Banias CPUs"
133 depends on X86_SPEEDSTEP_CENTRINO
134 default y
135 help
136 Use built-in tables for Banias CPUs if ACPI encoding
137 is not available.
138
139 If in doubt, say N.
140
141config X86_SPEEDSTEP_ICH
142 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
143 select CPU_FREQ_TABLE
144 help
145 This adds the CPUFreq driver for certain mobile Intel Pentium III
146 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
147 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
148 ICH3 or ICH4 southbridge.
149
150 For details, take a look at <file:Documentation/cpu-freq/>.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_SMI
155 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
156 select CPU_FREQ_TABLE
157 depends on EXPERIMENTAL
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
161 on systems which have an Intel 440BX/ZX/MX southbridge.
162
163 For details, take a look at <file:Documentation/cpu-freq/>.
164
165 If in doubt, say N.
166
167config X86_P4_CLOCKMOD
168 tristate "Intel Pentium 4 clock modulation"
169 select CPU_FREQ_TABLE
170 help
171 This adds the CPUFreq driver for Intel Pentium 4 / XEON
172 processors.
173
174 For details, take a look at <file:Documentation/cpu-freq/>.
175
176 If in doubt, say N.
177
178config X86_CPUFREQ_NFORCE2
179 tristate "nVidia nForce2 FSB changing"
180 depends on EXPERIMENTAL
181 help
182 This adds the CPUFreq driver for FSB changing on nVidia nForce2
183 platforms.
184
185 For details, take a look at <file:Documentation/cpu-freq/>.
186
187 If in doubt, say N.
188
189config X86_LONGRUN
190 tristate "Transmeta LongRun"
191 help
192 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
193 which support LongRun.
194
195 For details, take a look at <file:Documentation/cpu-freq/>.
196
197 If in doubt, say N.
198
199config X86_LONGHAUL
200 tristate "VIA Cyrix III Longhaul"
201 select CPU_FREQ_TABLE
202 depends on ACPI_PROCESSOR
203 help
204 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
205 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
206 processors.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_E_POWERSAVER
213 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
214 select CPU_FREQ_TABLE
215 depends on EXPERIMENTAL
216 help
217 This adds the CPUFreq driver for VIA C7 processors.
218
219 If in doubt, say N.
220
221comment "shared options"
222
223config X86_ACPI_CPUFREQ_PROC_INTF
224 bool "/proc/acpi/processor/../performance interface (deprecated)"
225 depends on PROC_FS
226 depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI
227 help
228 This enables the deprecated /proc/acpi/processor/../performance
229 interface. While it is helpful for debugging, the generic,
230 cross-architecture cpufreq interfaces should be used.
231
232 If in doubt, say N.
233
234config X86_SPEEDSTEP_LIB
235 tristate
236 default X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD
237
238config X86_SPEEDSTEP_RELAXED_CAP_CHECK
239 bool "Relaxed speedstep capability checks"
240 depends on (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
241 help
 242	  Don't perform all of the checks that would normally be done for a
 243	  speedstep-capable system. Some ancient or strange systems, though
 244	  speedstep capable, don't always indicate that they are. This
245 option lets the probing code bypass some of those checks if the
246 parameter "relaxed_check=1" is passed to the module.
247
248endif # CPU_FREQ
249
250endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
new file mode 100644
index 000000000000..560f7760dae5
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -0,0 +1,16 @@
1obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
7obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
8obj-$(CONFIG_X86_LONGRUN) += longrun.o
9obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
10obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
13obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
14obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
15obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
16obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
new file mode 100644
index 000000000000..b6434a7ef8b2
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -0,0 +1,799 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36
37#include <linux/acpi.h>
38#include <acpi/processor.h>
39
40#include <asm/io.h>
41#include <asm/msr.h>
42#include <asm/processor.h>
43#include <asm/cpufeature.h>
44#include <asm/delay.h>
45#include <asm/uaccess.h>
46
47#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
48
49MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
50MODULE_DESCRIPTION("ACPI Processor P-States Driver");
51MODULE_LICENSE("GPL");
52
53enum {
54 UNDEFINED_CAPABLE = 0,
55 SYSTEM_INTEL_MSR_CAPABLE,
56 SYSTEM_IO_CAPABLE,
57};
58
59#define INTEL_MSR_RANGE (0xffff)
60#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
61
62struct acpi_cpufreq_data {
63 struct acpi_processor_performance *acpi_data;
64 struct cpufreq_frequency_table *freq_table;
65 unsigned int max_freq;
66 unsigned int resume;
67 unsigned int cpu_feature;
68};
69
70static struct acpi_cpufreq_data *drv_data[NR_CPUS];
71/* acpi_perf_data is a pointer to percpu data. */
72static struct acpi_processor_performance *acpi_perf_data;
73
74static struct cpufreq_driver acpi_cpufreq_driver;
75
76static unsigned int acpi_pstate_strict;
77
78static int check_est_cpu(unsigned int cpuid)
79{
80 struct cpuinfo_x86 *cpu = &cpu_data[cpuid];
81
82 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
83 !cpu_has(cpu, X86_FEATURE_EST))
84 return 0;
85
86 return 1;
87}
88
89static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
90{
91 struct acpi_processor_performance *perf;
92 int i;
93
94 perf = data->acpi_data;
95
96 for (i=0; i<perf->state_count; i++) {
97 if (value == perf->states[i].status)
98 return data->freq_table[i].frequency;
99 }
100 return 0;
101}
102
103static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
104{
105 int i;
106 struct acpi_processor_performance *perf;
107
108 msr &= INTEL_MSR_RANGE;
109 perf = data->acpi_data;
110
111 for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
112 if (msr == perf->states[data->freq_table[i].index].status)
113 return data->freq_table[i].frequency;
114 }
115 return data->freq_table[0].frequency;
116}
117
118static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
119{
120 switch (data->cpu_feature) {
121 case SYSTEM_INTEL_MSR_CAPABLE:
122 return extract_msr(val, data);
123 case SYSTEM_IO_CAPABLE:
124 return extract_io(val, data);
125 default:
126 return 0;
127 }
128}
129
130struct msr_addr {
131 u32 reg;
132};
133
134struct io_addr {
135 u16 port;
136 u8 bit_width;
137};
138
139typedef union {
140 struct msr_addr msr;
141 struct io_addr io;
142} drv_addr_union;
143
144struct drv_cmd {
145 unsigned int type;
146 cpumask_t mask;
147 drv_addr_union addr;
148 u32 val;
149};
150
151static void do_drv_read(struct drv_cmd *cmd)
152{
153 u32 h;
154
155 switch (cmd->type) {
156 case SYSTEM_INTEL_MSR_CAPABLE:
157 rdmsr(cmd->addr.msr.reg, cmd->val, h);
158 break;
159 case SYSTEM_IO_CAPABLE:
160 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
161 &cmd->val,
162 (u32)cmd->addr.io.bit_width);
163 break;
164 default:
165 break;
166 }
167}
168
169static void do_drv_write(struct drv_cmd *cmd)
170{
171 u32 lo, hi;
172
173 switch (cmd->type) {
174 case SYSTEM_INTEL_MSR_CAPABLE:
175 rdmsr(cmd->addr.msr.reg, lo, hi);
176 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
177 wrmsr(cmd->addr.msr.reg, lo, hi);
178 break;
179 case SYSTEM_IO_CAPABLE:
180 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
181 cmd->val,
182 (u32)cmd->addr.io.bit_width);
183 break;
184 default:
185 break;
186 }
187}
188
189static void drv_read(struct drv_cmd *cmd)
190{
191 cpumask_t saved_mask = current->cpus_allowed;
192 cmd->val = 0;
193
194 set_cpus_allowed(current, cmd->mask);
195 do_drv_read(cmd);
196 set_cpus_allowed(current, saved_mask);
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 cpumask_t saved_mask = current->cpus_allowed;
202 unsigned int i;
203
204 for_each_cpu_mask(i, cmd->mask) {
205 set_cpus_allowed(current, cpumask_of_cpu(i));
206 do_drv_write(cmd);
207 }
208
209 set_cpus_allowed(current, saved_mask);
210 return;
211}
212
213static u32 get_cur_val(cpumask_t mask)
214{
215 struct acpi_processor_performance *perf;
216 struct drv_cmd cmd;
217
218 if (unlikely(cpus_empty(mask)))
219 return 0;
220
221 switch (drv_data[first_cpu(mask)]->cpu_feature) {
222 case SYSTEM_INTEL_MSR_CAPABLE:
223 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
224 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
225 break;
226 case SYSTEM_IO_CAPABLE:
227 cmd.type = SYSTEM_IO_CAPABLE;
228 perf = drv_data[first_cpu(mask)]->acpi_data;
229 cmd.addr.io.port = perf->control_register.address;
230 cmd.addr.io.bit_width = perf->control_register.bit_width;
231 break;
232 default:
233 return 0;
234 }
235
236 cmd.mask = mask;
237
238 drv_read(&cmd);
239
240 dprintk("get_cur_val = %u\n", cmd.val);
241
242 return cmd.val;
243}
244
245/*
246 * Return the measured active (C0) frequency on this CPU since last call
247 * to this function.
248 * Input: cpu number
249 * Return: Average CPU frequency in terms of max frequency (zero on error)
250 *
251 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
252 * over a period of time, while CPU is in C0 state.
253 * IA32_MPERF counts at the rate of max advertised frequency
254 * IA32_APERF counts at the rate of actual CPU frequency
255 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
256 * no meaning should be associated with absolute values of these MSRs.
257 */
258static unsigned int get_measured_perf(unsigned int cpu)
259{
260 union {
261 struct {
262 u32 lo;
263 u32 hi;
264 } split;
265 u64 whole;
266 } aperf_cur, mperf_cur;
267
268 cpumask_t saved_mask;
269 unsigned int perf_percent;
270 unsigned int retval;
271
272 saved_mask = current->cpus_allowed;
273 set_cpus_allowed(current, cpumask_of_cpu(cpu));
274 if (get_cpu() != cpu) {
275 /* We were not able to run on requested processor */
276 put_cpu();
277 return 0;
278 }
279
280 rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
281 rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
282
283 wrmsr(MSR_IA32_APERF, 0,0);
284 wrmsr(MSR_IA32_MPERF, 0,0);
285
286#ifdef __i386__
287 /*
 288	 * We don't want to do a 64-bit divide with a 32-bit kernel.
 289	 * Get an approximate value instead; return failure in case we
 290	 * cannot get even an approximation.
291 */
292 if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) {
293 int shift_count;
294 u32 h;
295
296 h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi);
297 shift_count = fls(h);
298
299 aperf_cur.whole >>= shift_count;
300 mperf_cur.whole >>= shift_count;
301 }
302
303 if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) {
304 int shift_count = 7;
305 aperf_cur.split.lo >>= shift_count;
306 mperf_cur.split.lo >>= shift_count;
307 }
308
309 if (aperf_cur.split.lo && mperf_cur.split.lo)
310 perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo;
311 else
312 perf_percent = 0;
313
314#else
315 if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
316 int shift_count = 7;
317 aperf_cur.whole >>= shift_count;
318 mperf_cur.whole >>= shift_count;
319 }
320
321 if (aperf_cur.whole && mperf_cur.whole)
322 perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
323 else
324 perf_percent = 0;
325
326#endif
327
328 retval = drv_data[cpu]->max_freq * perf_percent / 100;
329
330 put_cpu();
331 set_cpus_allowed(current, saved_mask);
332
333 dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
334 return retval;
335}
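The APERF/MPERF ratio used above reduces to simple integer arithmetic once the counter deltas are in hand. The following stand-alone sketch (hypothetical counter deltas and maximum frequency, not taken from the patch) shows the percentage and the effective frequency that would be reported:

	#include <stdio.h>
	#include <stdint.h>

	/* Sketch of the measured-performance math: with these made-up
	 * deltas the CPU ran at ~75% of its maximum advertised frequency. */
	int main(void)
	{
		uint64_t aperf = 1500000, mperf = 2000000;	/* hypothetical deltas   */
		unsigned max_freq_khz = 2400000;		/* 2.4 GHz, hypothetical */

		unsigned perf_percent = (unsigned)(aperf * 100 / mperf);	/* 75      */
		unsigned measured_khz = max_freq_khz / 100 * perf_percent;	/* 1800000 */

		printf("%u%% of max -> %u kHz\n", perf_percent, measured_khz);
		return 0;
	}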
336
337static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
338{
339 struct acpi_cpufreq_data *data = drv_data[cpu];
340 unsigned int freq;
341
342 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
343
344 if (unlikely(data == NULL ||
345 data->acpi_data == NULL || data->freq_table == NULL)) {
346 return 0;
347 }
348
349 freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
350 dprintk("cur freq = %u\n", freq);
351
352 return freq;
353}
354
355static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
356 struct acpi_cpufreq_data *data)
357{
358 unsigned int cur_freq;
359 unsigned int i;
360
361 for (i=0; i<100; i++) {
362 cur_freq = extract_freq(get_cur_val(mask), data);
363 if (cur_freq == freq)
364 return 1;
365 udelay(10);
366 }
367 return 0;
368}
369
370static int acpi_cpufreq_target(struct cpufreq_policy *policy,
371 unsigned int target_freq, unsigned int relation)
372{
373 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
374 struct acpi_processor_performance *perf;
375 struct cpufreq_freqs freqs;
376 cpumask_t online_policy_cpus;
377 struct drv_cmd cmd;
378 unsigned int next_state = 0; /* Index into freq_table */
379 unsigned int next_perf_state = 0; /* Index into perf table */
380 unsigned int i;
381 int result = 0;
382
383 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
384
385 if (unlikely(data == NULL ||
386 data->acpi_data == NULL || data->freq_table == NULL)) {
387 return -ENODEV;
388 }
389
390 perf = data->acpi_data;
391 result = cpufreq_frequency_table_target(policy,
392 data->freq_table,
393 target_freq,
394 relation, &next_state);
395 if (unlikely(result))
396 return -ENODEV;
397
398#ifdef CONFIG_HOTPLUG_CPU
399 /* cpufreq holds the hotplug lock, so we are safe from here on */
400 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
401#else
402 online_policy_cpus = policy->cpus;
403#endif
404
405 next_perf_state = data->freq_table[next_state].index;
406 if (perf->state == next_perf_state) {
407 if (unlikely(data->resume)) {
408 dprintk("Called after resume, resetting to P%d\n",
409 next_perf_state);
410 data->resume = 0;
411 } else {
412 dprintk("Already at target state (P%d)\n",
413 next_perf_state);
414 return 0;
415 }
416 }
417
418 switch (data->cpu_feature) {
419 case SYSTEM_INTEL_MSR_CAPABLE:
420 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
421 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
422 cmd.val = (u32) perf->states[next_perf_state].control;
423 break;
424 case SYSTEM_IO_CAPABLE:
425 cmd.type = SYSTEM_IO_CAPABLE;
426 cmd.addr.io.port = perf->control_register.address;
427 cmd.addr.io.bit_width = perf->control_register.bit_width;
428 cmd.val = (u32) perf->states[next_perf_state].control;
429 break;
430 default:
431 return -ENODEV;
432 }
433
434 cpus_clear(cmd.mask);
435
436 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
437 cmd.mask = online_policy_cpus;
438 else
439 cpu_set(policy->cpu, cmd.mask);
440
441 freqs.old = perf->states[perf->state].core_frequency * 1000;
442 freqs.new = data->freq_table[next_state].frequency;
443 for_each_cpu_mask(i, cmd.mask) {
444 freqs.cpu = i;
445 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
446 }
447
448 drv_write(&cmd);
449
450 if (acpi_pstate_strict) {
451 if (!check_freqs(cmd.mask, freqs.new, data)) {
452 dprintk("acpi_cpufreq_target failed (%d)\n",
453 policy->cpu);
454 return -EAGAIN;
455 }
456 }
457
458 for_each_cpu_mask(i, cmd.mask) {
459 freqs.cpu = i;
460 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
461 }
462 perf->state = next_perf_state;
463
464 return result;
465}
466
467static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
468{
469 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
470
471 dprintk("acpi_cpufreq_verify\n");
472
473 return cpufreq_frequency_table_verify(policy, data->freq_table);
474}
475
476static unsigned long
477acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
478{
479 struct acpi_processor_performance *perf = data->acpi_data;
480
481 if (cpu_khz) {
482 /* search the closest match to cpu_khz */
483 unsigned int i;
484 unsigned long freq;
485 unsigned long freqn = perf->states[0].core_frequency * 1000;
486
487 for (i=0; i<(perf->state_count-1); i++) {
488 freq = freqn;
489 freqn = perf->states[i+1].core_frequency * 1000;
490 if ((2 * cpu_khz) > (freqn + freq)) {
491 perf->state = i;
492 return freq;
493 }
494 }
495 perf->state = perf->state_count-1;
496 return freqn;
497 } else {
498 /* assume CPU is at P0... */
499 perf->state = 0;
500 return perf->states[0].core_frequency * 1000;
501 }
502}
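The "closest match to cpu_khz" rule above picks the first P-state whose midpoint with the next lower state lies below the measured cpu_khz. A minimal stand-alone sketch with made-up frequencies (not part of the patch) makes the midpoint comparison explicit:

	#include <stdio.h>

	/* Sketch of the guess_freq midpoint rule, using hypothetical
	 * P-state frequencies (kHz) and a measured cpu_khz of 2100000. */
	int main(void)
	{
		unsigned long states[] = { 2400000, 2000000, 1600000 };
		unsigned long cpu_khz = 2100000;
		unsigned n = sizeof(states) / sizeof(states[0]);
		unsigned i, state = n - 1;

		for (i = 0; i < n - 1; i++) {
			unsigned long freq = states[i], freqn = states[i + 1];
			/* pick states[i] once cpu_khz is above the midpoint
			 * of states[i] and states[i+1] */
			if (2 * cpu_khz > freq + freqn) {
				state = i;
				break;
			}
		}
		printf("guessed P%u at %lu kHz\n", state, states[state]); /* P1, 2000000 */
		return 0;
	}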
503
504/*
505 * acpi_cpufreq_early_init - initialize ACPI P-States library
506 *
507 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
508 * in order to determine correct frequency and voltage pairings. We can
509 * do _PDC and _PSD and find out the processor dependency for the
510 * actual init that will happen later...
511 */
512static int __init acpi_cpufreq_early_init(void)
513{
514 dprintk("acpi_cpufreq_early_init\n");
515
516 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
517 if (!acpi_perf_data) {
518 dprintk("Memory allocation error for acpi_perf_data.\n");
519 return -ENOMEM;
520 }
521
522 /* Do initialization in ACPI core */
523 acpi_processor_preregister_performance(acpi_perf_data);
524 return 0;
525}
526
527#ifdef CONFIG_SMP
528/*
 529 * Some BIOSes do SW_ANY coordination internally, either setting it up
 530 * in hardware or doing it in BIOS firmware, without informing the OS.
 531 * If not detected, this has the side effect of making the CPU run at a
 532 * different speed than the OS intended. Detect it and handle it cleanly.
533 */
534static int bios_with_sw_any_bug;
535
536static int sw_any_bug_found(const struct dmi_system_id *d)
537{
538 bios_with_sw_any_bug = 1;
539 return 0;
540}
541
542static const struct dmi_system_id sw_any_bug_dmi_table[] = {
543 {
544 .callback = sw_any_bug_found,
545 .ident = "Supermicro Server X6DLP",
546 .matches = {
547 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
548 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
549 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
550 },
551 },
552 { }
553};
554#endif
555
556static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
557{
558 unsigned int i;
559 unsigned int valid_states = 0;
560 unsigned int cpu = policy->cpu;
561 struct acpi_cpufreq_data *data;
562 unsigned int result = 0;
563 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
564 struct acpi_processor_performance *perf;
565
566 dprintk("acpi_cpufreq_cpu_init\n");
567
568 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
569 if (!data)
570 return -ENOMEM;
571
572 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
573 drv_data[cpu] = data;
574
575 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
576 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
577
578 result = acpi_processor_register_performance(data->acpi_data, cpu);
579 if (result)
580 goto err_free;
581
582 perf = data->acpi_data;
583 policy->shared_type = perf->shared_type;
584
585 /*
586 * Will let policy->cpus know about dependency only when software
587 * coordination is required.
588 */
589 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
590 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
591 policy->cpus = perf->shared_cpu_map;
592 }
593
594#ifdef CONFIG_SMP
595 dmi_check_system(sw_any_bug_dmi_table);
596 if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
597 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
598 policy->cpus = cpu_core_map[cpu];
599 }
600#endif
601
602 /* capability check */
603 if (perf->state_count <= 1) {
604 dprintk("No P-States\n");
605 result = -ENODEV;
606 goto err_unreg;
607 }
608
609 if (perf->control_register.space_id != perf->status_register.space_id) {
610 result = -ENODEV;
611 goto err_unreg;
612 }
613
614 switch (perf->control_register.space_id) {
615 case ACPI_ADR_SPACE_SYSTEM_IO:
616 dprintk("SYSTEM IO addr space\n");
617 data->cpu_feature = SYSTEM_IO_CAPABLE;
618 break;
619 case ACPI_ADR_SPACE_FIXED_HARDWARE:
620 dprintk("HARDWARE addr space\n");
621 if (!check_est_cpu(cpu)) {
622 result = -ENODEV;
623 goto err_unreg;
624 }
625 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
626 break;
627 default:
628 dprintk("Unknown addr space %d\n",
629 (u32) (perf->control_register.space_id));
630 result = -ENODEV;
631 goto err_unreg;
632 }
633
634 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
635 (perf->state_count+1), GFP_KERNEL);
636 if (!data->freq_table) {
637 result = -ENOMEM;
638 goto err_unreg;
639 }
640
641 /* detect transition latency */
642 policy->cpuinfo.transition_latency = 0;
643 for (i=0; i<perf->state_count; i++) {
644 if ((perf->states[i].transition_latency * 1000) >
645 policy->cpuinfo.transition_latency)
646 policy->cpuinfo.transition_latency =
647 perf->states[i].transition_latency * 1000;
648 }
649 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
650
651 data->max_freq = perf->states[0].core_frequency * 1000;
652 /* table init */
653 for (i=0; i<perf->state_count; i++) {
654 if (i>0 && perf->states[i].core_frequency >=
655 data->freq_table[valid_states-1].frequency / 1000)
656 continue;
657
658 data->freq_table[valid_states].index = i;
659 data->freq_table[valid_states].frequency =
660 perf->states[i].core_frequency * 1000;
661 valid_states++;
662 }
663 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
664 perf->state = 0;
665
666 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
667 if (result)
668 goto err_freqfree;
669
670 switch (perf->control_register.space_id) {
671 case ACPI_ADR_SPACE_SYSTEM_IO:
672 /* Current speed is unknown and not detectable by IO port */
673 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
674 break;
675 case ACPI_ADR_SPACE_FIXED_HARDWARE:
676 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
677 policy->cur = get_cur_freq_on_cpu(cpu);
678 break;
679 default:
680 break;
681 }
682
683 /* notify BIOS that we exist */
684 acpi_processor_notify_smm(THIS_MODULE);
685
686 /* Check for APERF/MPERF support in hardware */
687 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
688 unsigned int ecx;
689 ecx = cpuid_ecx(6);
690 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
691 acpi_cpufreq_driver.getavg = get_measured_perf;
692 }
693
694 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
695 for (i = 0; i < perf->state_count; i++)
696 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
697 (i == perf->state ? '*' : ' '), i,
698 (u32) perf->states[i].core_frequency,
699 (u32) perf->states[i].power,
700 (u32) perf->states[i].transition_latency);
701
702 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
703
704 /*
705 * the first call to ->target() should result in us actually
706 * writing something to the appropriate registers.
707 */
708 data->resume = 1;
709
710 return result;
711
712err_freqfree:
713 kfree(data->freq_table);
714err_unreg:
715 acpi_processor_unregister_performance(perf, cpu);
716err_free:
717 kfree(data);
718 drv_data[cpu] = NULL;
719
720 return result;
721}
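
The table-init loop above copies the ACPI P-states (core_frequency values in MHz, expected to arrive sorted highest first) into a cpufreq table in kHz, skipping any state that does not strictly lower the frequency. A minimal stand-alone sketch of that de-duplication step, using invented state data rather than real ACPI output:

#include <stdio.h>

#define TABLE_END ~0u

int main(void)
{
	/* hypothetical P-state core frequencies in MHz, highest first,
	 * with a duplicate as BIOSes sometimes report */
	unsigned int mhz[] = { 2000, 2000, 1600, 1200, 800 };
	unsigned int khz[6];		/* state_count + 1 entries */
	unsigned int i, valid = 0;

	for (i = 0; i < 5; i++) {
		/* skip states that do not decrease the frequency */
		if (i > 0 && mhz[i] >= khz[valid - 1] / 1000)
			continue;
		khz[valid++] = mhz[i] * 1000;
	}
	khz[valid] = TABLE_END;

	for (i = 0; khz[i] != TABLE_END; i++)
		printf("entry %u: %u kHz\n", i, khz[i]);
	return 0;
}

Run on its own this prints four entries; the duplicate 2000 MHz state is dropped, much as the driver drops duplicate BIOS-reported states.
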
722
723static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
724{
725 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
726
727 dprintk("acpi_cpufreq_cpu_exit\n");
728
729 if (data) {
730 cpufreq_frequency_table_put_attr(policy->cpu);
731 drv_data[policy->cpu] = NULL;
732 acpi_processor_unregister_performance(data->acpi_data,
733 policy->cpu);
734 kfree(data);
735 }
736
737 return 0;
738}
739
740static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
741{
742 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
743
744 dprintk("acpi_cpufreq_resume\n");
745
746 data->resume = 1;
747
748 return 0;
749}
750
751static struct freq_attr *acpi_cpufreq_attr[] = {
752 &cpufreq_freq_attr_scaling_available_freqs,
753 NULL,
754};
755
756static struct cpufreq_driver acpi_cpufreq_driver = {
757 .verify = acpi_cpufreq_verify,
758 .target = acpi_cpufreq_target,
759 .init = acpi_cpufreq_cpu_init,
760 .exit = acpi_cpufreq_cpu_exit,
761 .resume = acpi_cpufreq_resume,
762 .name = "acpi-cpufreq",
763 .owner = THIS_MODULE,
764 .attr = acpi_cpufreq_attr,
765};
766
767static int __init acpi_cpufreq_init(void)
768{
769 int ret;
770
771 dprintk("acpi_cpufreq_init\n");
772
773 ret = acpi_cpufreq_early_init();
774 if (ret)
775 return ret;
776
777 return cpufreq_register_driver(&acpi_cpufreq_driver);
778}
779
780static void __exit acpi_cpufreq_exit(void)
781{
782 dprintk("acpi_cpufreq_exit\n");
783
784 cpufreq_unregister_driver(&acpi_cpufreq_driver);
785
786 free_percpu(acpi_perf_data);
787
788 return;
789}
790
791module_param(acpi_pstate_strict, uint, 0644);
792MODULE_PARM_DESC(acpi_pstate_strict,
793 "value 0 or non-zero. non-zero -> strict ACPI checks are "
794 "performed during frequency changes.");
795
796late_initcall(acpi_cpufreq_init);
797module_exit(acpi_cpufreq_exit);
798
799MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
new file mode 100644
index 000000000000..66acd5039918
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -0,0 +1,441 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29//#define NFORCE2_DELAY 10
30
31/* nforce2_chipset:
32 * FSB is changed using the chipset
33 */
34static struct pci_dev *nforce2_chipset_dev;
35
36/* fid:
37 * multiplier * 10
38 */
39static int fid = 0;
40
41/* min_fsb, max_fsb:
42 * minimum and maximum FSB (= FSB at boot time)
43 */
44static int min_fsb = 0;
45static int max_fsb = 0;
46
47MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
48MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
49MODULE_LICENSE("GPL");
50
51module_param(fid, int, 0444);
52module_param(min_fsb, int, 0444);
53
54MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
55MODULE_PARM_DESC(min_fsb,
56 "Minimum FSB to use, if not defined: current FSB - 50");
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
59
60/**
61 * nforce2_calc_fsb - calculate FSB
62 * @pll: PLL value
63 *
64 * Calculates FSB from PLL value
65 */
66static int nforce2_calc_fsb(int pll)
67{
68 unsigned char mul, div;
69
70 mul = (pll >> 8) & 0xff;
71 div = pll & 0xff;
72
73 if (div > 0)
74 return NFORCE2_XTAL * mul / div;
75
76 return 0;
77}
78
79/**
80 * nforce2_calc_pll - calculate PLL value
81 * @fsb: FSB
82 *
83 * Calculate PLL value for given FSB
84 */
85static int nforce2_calc_pll(unsigned int fsb)
86{
87 unsigned char xmul, xdiv;
88 unsigned char mul = 0, div = 0;
89 int tried = 0;
90
91 /* Try to calculate multiplier and divider up to 4 times */
92 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
93 for (xdiv = 2; xdiv <= 0x80; xdiv++)
94 for (xmul = 1; xmul <= 0xfe; xmul++)
95 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
96 fsb + tried) {
97 mul = xmul;
98 div = xdiv;
99 }
100 tried++;
101 }
102
103 if ((mul == 0) || (div == 0))
104 return -1;
105
106 return NFORCE2_PLL(mul, div);
107}
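
To make the register arithmetic concrete, here is a small stand-alone sketch (not driver code) of the same FSB/PLL conversion, using the 25 MHz crystal and the mul/div encoding from the NFORCE2_PLL macro above; the mul and div values are purely illustrative:

#include <stdio.h>

#define XTAL 25				/* NFORCE2_XTAL, in MHz */
#define PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

static int calc_fsb(int pll)
{
	unsigned char mul = (pll >> 8) & 0xff;
	unsigned char div = pll & 0xff;

	return div ? XTAL * mul / div : 0;
}

int main(void)
{
	/* e.g. mul = 16, div = 2 encodes 25 * 16 / 2 = 200 MHz FSB */
	int pll = PLL(16, 2);

	printf("pll 0x%x -> fsb %d MHz\n", pll, calc_fsb(pll));
	return 0;
}

nforce2_calc_pll() simply brute-forces this relation in reverse, searching mul/div pairs until one reproduces the requested FSB.
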
108
109/**
110 * nforce2_write_pll - write PLL value to chipset
111 * @pll: PLL value
112 *
113 * Writes new FSB PLL value to chipset
114 */
115static void nforce2_write_pll(int pll)
116{
117 int temp;
118
119 /* Set the pll addr. to 0x00 */
120 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0);
121
122 /* Now write the value in all 64 registers */
123 for (temp = 0; temp <= 0x3f; temp++)
124 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, pll);
125
126 return;
127}
128
129/**
130 * nforce2_fsb_read - Read FSB
131 *
132 * Read FSB from chipset
133 * If bootfsb != 0, return FSB at boot-time
134 */
135static unsigned int nforce2_fsb_read(int bootfsb)
136{
137 struct pci_dev *nforce2_sub5;
138 u32 fsb, temp = 0;
139
140 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
141 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
142 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
143 if (!nforce2_sub5)
144 return 0;
145
146 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
147 fsb /= 1000000;
148
149 /* Check if PLL register is already set */
150 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
151
152 if(bootfsb || !temp)
153 return fsb;
154
155 /* Use PLL register FSB value */
156 pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
157 fsb = nforce2_calc_fsb(temp);
158
159 return fsb;
160}
161
162/**
163 * nforce2_set_fsb - set new FSB
164 * @fsb: New FSB
165 *
166 * Sets new FSB
167 */
168static int nforce2_set_fsb(unsigned int fsb)
169{
170 u32 temp = 0;
171 unsigned int tfsb;
172 int diff;
173 int pll = 0;
174
175 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
176 printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb);
177 return -EINVAL;
178 }
179
180 tfsb = nforce2_fsb_read(0);
181 if (!tfsb) {
182 printk(KERN_ERR "cpufreq: Error while reading the FSB\n");
183 return -EINVAL;
184 }
185
186 /* First write? Then set actual value */
187 pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
188 if (!temp) {
189 pll = nforce2_calc_pll(tfsb);
190
191 if (pll < 0)
192 return -EINVAL;
193
194 nforce2_write_pll(pll);
195 }
196
197 /* Enable write access */
198 temp = 0x01;
199 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp);
200
201 diff = tfsb - fsb;
202
203 if (!diff)
204 return 0;
205
206 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
207 if (diff < 0)
208 tfsb++;
209 else
210 tfsb--;
211
212 /* Calculate the PLL reg. value */
213 if ((pll = nforce2_calc_pll(tfsb)) == -1)
214 return -EINVAL;
215
216 nforce2_write_pll(pll);
217#ifdef NFORCE2_DELAY
218 mdelay(NFORCE2_DELAY);
219#endif
220 }
221
222 temp = 0x40;
223 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp);
224
225 return 0;
226}
227
228/**
229 * nforce2_get - get the CPU frequency
230 * @cpu: CPU number
231 *
232 * Returns the CPU frequency
233 */
234static unsigned int nforce2_get(unsigned int cpu)
235{
236 if (cpu)
237 return 0;
238 return nforce2_fsb_read(0) * fid * 100;
239}
240
241/**
242 * nforce2_target - set a new CPUFreq policy
243 * @policy: new policy
244 * @target_freq: the target frequency
245 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
246 *
247 * Sets a new CPUFreq policy.
248 */
249static int nforce2_target(struct cpufreq_policy *policy,
250 unsigned int target_freq, unsigned int relation)
251{
252// unsigned long flags;
253 struct cpufreq_freqs freqs;
254 unsigned int target_fsb;
255
256 if ((target_freq > policy->max) || (target_freq < policy->min))
257 return -EINVAL;
258
259 target_fsb = target_freq / (fid * 100);
260
261 freqs.old = nforce2_get(policy->cpu);
262 freqs.new = target_fsb * fid * 100;
 262	freqs.cpu = 0;		/* Only one CPU on nForce2 platforms */
264
265 if (freqs.old == freqs.new)
266 return 0;
267
268 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
269 freqs.old, freqs.new);
270
271 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
272
273 /* Disable IRQs */
274 //local_irq_save(flags);
275
276 if (nforce2_set_fsb(target_fsb) < 0)
277 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
278 target_fsb);
279 else
280 dprintk("Changed FSB successfully to %d\n",
281 target_fsb);
282
283 /* Enable IRQs */
284 //local_irq_restore(flags);
285
286 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
287
288 return 0;
289}
290
291/**
292 * nforce2_verify - verifies a new CPUFreq policy
293 * @policy: new policy
294 */
295static int nforce2_verify(struct cpufreq_policy *policy)
296{
297 unsigned int fsb_pol_max;
298
299 fsb_pol_max = policy->max / (fid * 100);
300
301 if (policy->min < (fsb_pol_max * fid * 100))
302 policy->max = (fsb_pol_max + 1) * fid * 100;
303
304 cpufreq_verify_within_limits(policy,
305 policy->cpuinfo.min_freq,
306 policy->cpuinfo.max_freq);
307 return 0;
308}
309
310static int nforce2_cpu_init(struct cpufreq_policy *policy)
311{
312 unsigned int fsb;
313 unsigned int rfid;
314
315 /* capability check */
316 if (policy->cpu != 0)
317 return -ENODEV;
318
319 /* Get current FSB */
320 fsb = nforce2_fsb_read(0);
321
322 if (!fsb)
323 return -EIO;
324
325 /* FIX: Get FID from CPU */
326 if (!fid) {
327 if (!cpu_khz) {
328 printk(KERN_WARNING
329 "cpufreq: cpu_khz not set, can't calculate multiplier!\n");
330 return -ENODEV;
331 }
332
333 fid = cpu_khz / (fsb * 100);
334 rfid = fid % 5;
335
336 if (rfid) {
337 if (rfid > 2)
338 fid += 5 - rfid;
339 else
340 fid -= rfid;
341 }
342 }
343
344 printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb,
345 fid / 10, fid % 10);
346
347 /* Set maximum FSB to FSB at boot time */
348 max_fsb = nforce2_fsb_read(1);
349
350 if(!max_fsb)
351 return -EIO;
352
353 if (!min_fsb)
354 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
355
356 if (min_fsb < NFORCE2_MIN_FSB)
357 min_fsb = NFORCE2_MIN_FSB;
358
359 /* cpuinfo and default policy values */
360 policy->cpuinfo.min_freq = min_fsb * fid * 100;
361 policy->cpuinfo.max_freq = max_fsb * fid * 100;
362 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
363 policy->cur = nforce2_get(policy->cpu);
364 policy->min = policy->cpuinfo.min_freq;
365 policy->max = policy->cpuinfo.max_freq;
366 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
367
368 return 0;
369}
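
The FID auto-detection above divides cpu_khz by (FSB * 100) and then rounds to the nearest multiple of 5, i.e. to the nearest half-step multiplier. A tiny stand-alone illustration of that rounding, with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned int cpu_khz = 1570000;	/* hypothetical TSC-derived speed */
	unsigned int fsb = 133;		/* hypothetical FSB in MHz */
	int fid, rfid;

	fid = cpu_khz / (fsb * 100);	/* multiplier * 10 */
	rfid = fid % 5;
	if (rfid) {			/* round to nearest half step */
		if (rfid > 2)
			fid += 5 - rfid;
		else
			fid -= rfid;
	}
	printf("FID %d.%d\n", fid / 10, fid % 10);
	return 0;
}

With these sample values the raw quotient is 118, which rounds up to 120, i.e. a 12.0x multiplier.
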
370
371static int nforce2_cpu_exit(struct cpufreq_policy *policy)
372{
373 return 0;
374}
375
376static struct cpufreq_driver nforce2_driver = {
377 .name = "nforce2",
378 .verify = nforce2_verify,
379 .target = nforce2_target,
380 .get = nforce2_get,
381 .init = nforce2_cpu_init,
382 .exit = nforce2_cpu_exit,
383 .owner = THIS_MODULE,
384};
385
386/**
387 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
388 *
389 * Detects nForce2 A2 and C1 stepping
390 *
391 */
392static unsigned int nforce2_detect_chipset(void)
393{
394 nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
395 PCI_DEVICE_ID_NVIDIA_NFORCE2,
396 PCI_ANY_ID, PCI_ANY_ID, NULL);
397
398 if (nforce2_chipset_dev == NULL)
399 return -ENODEV;
400
401 printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n",
402 nforce2_chipset_dev->revision);
403 printk(KERN_INFO
404 "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n");
405
406 return 0;
407}
408
409/**
410 * nforce2_init - initializes the nForce2 CPUFreq driver
411 *
412 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
 413 * devices, -EINVAL on problems during initialization, and zero on
414 * success.
415 */
416static int __init nforce2_init(void)
417{
418 /* TODO: do we need to detect the processor? */
419
420 /* detect chipset */
421 if (nforce2_detect_chipset()) {
422 printk(KERN_ERR "cpufreq: No nForce2 chipset.\n");
423 return -ENODEV;
424 }
425
426 return cpufreq_register_driver(&nforce2_driver);
427}
428
429/**
430 * nforce2_exit - unregisters cpufreq module
431 *
432 * Unregisters nForce2 FSB change support.
433 */
434static void __exit nforce2_exit(void)
435{
436 cpufreq_unregister_driver(&nforce2_driver);
437}
438
439module_init(nforce2_init);
440module_exit(nforce2_exit);
441
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
new file mode 100644
index 000000000000..f43d98e11cc7
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -0,0 +1,334 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15
16#include <asm/msr.h>
17#include <asm/tsc.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20#include <asm/delay.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26
27struct eps_cpu_data {
28 u32 fsb;
29 struct cpufreq_frequency_table freq_table[];
30};
31
32static struct eps_cpu_data *eps_cpu[NR_CPUS];
33
34
35static unsigned int eps_get(unsigned int cpu)
36{
37 struct eps_cpu_data *centaur;
38 u32 lo, hi;
39
40 if (cpu)
41 return 0;
42 centaur = eps_cpu[cpu];
43 if (centaur == NULL)
44 return 0;
45
46 /* Return current frequency */
47 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
48 return centaur->fsb * ((lo >> 8) & 0xff);
49}
50
51static int eps_set_state(struct eps_cpu_data *centaur,
52 unsigned int cpu,
53 u32 dest_state)
54{
55 struct cpufreq_freqs freqs;
56 u32 lo, hi;
57 int err = 0;
58 int i;
59
60 freqs.old = eps_get(cpu);
61 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
62 freqs.cpu = cpu;
63 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
64
65 /* Wait while CPU is busy */
66 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
67 i = 0;
68 while (lo & ((1 << 16) | (1 << 17))) {
69 udelay(16);
70 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
71 i++;
72 if (unlikely(i > 64)) {
73 err = -ENODEV;
74 goto postchange;
75 }
76 }
77 /* Set new multiplier and voltage */
78 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
79 /* Wait until transition end */
80 i = 0;
81 do {
82 udelay(16);
83 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
84 i++;
85 if (unlikely(i > 64)) {
86 err = -ENODEV;
87 goto postchange;
88 }
89 } while (lo & ((1 << 16) | (1 << 17)));
90
91 /* Return current frequency */
92postchange:
93 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
94 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
95
96 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
97 return err;
98}
99
100static int eps_target(struct cpufreq_policy *policy,
101 unsigned int target_freq,
102 unsigned int relation)
103{
104 struct eps_cpu_data *centaur;
105 unsigned int newstate = 0;
106 unsigned int cpu = policy->cpu;
107 unsigned int dest_state;
108 int ret;
109
110 if (unlikely(eps_cpu[cpu] == NULL))
111 return -ENODEV;
112 centaur = eps_cpu[cpu];
113
114 if (unlikely(cpufreq_frequency_table_target(policy,
115 &eps_cpu[cpu]->freq_table[0],
116 target_freq,
117 relation,
118 &newstate))) {
119 return -EINVAL;
120 }
121
122 /* Make frequency transition */
123 dest_state = centaur->freq_table[newstate].index & 0xffff;
124 ret = eps_set_state(centaur, cpu, dest_state);
125 if (ret)
126 printk(KERN_ERR "eps: Timeout!\n");
127 return ret;
128}
129
130static int eps_verify(struct cpufreq_policy *policy)
131{
132 return cpufreq_frequency_table_verify(policy,
133 &eps_cpu[policy->cpu]->freq_table[0]);
134}
135
136static int eps_cpu_init(struct cpufreq_policy *policy)
137{
138 unsigned int i;
139 u32 lo, hi;
140 u64 val;
141 u8 current_multiplier, current_voltage;
142 u8 max_multiplier, max_voltage;
143 u8 min_multiplier, min_voltage;
144 u8 brand;
145 u32 fsb;
146 struct eps_cpu_data *centaur;
147 struct cpufreq_frequency_table *f_table;
148 int k, step, voltage;
149 int ret;
150 int states;
151
152 if (policy->cpu != 0)
153 return -ENODEV;
154
155 /* Check brand */
156 printk("eps: Detected VIA ");
157 rdmsr(0x1153, lo, hi);
158 brand = (((lo >> 2) ^ lo) >> 18) & 3;
159 switch(brand) {
160 case EPS_BRAND_C7M:
161 printk("C7-M\n");
162 break;
163 case EPS_BRAND_C7:
164 printk("C7\n");
165 break;
166 case EPS_BRAND_EDEN:
167 printk("Eden\n");
168 break;
169 case EPS_BRAND_C3:
170 printk("C3\n");
171 return -ENODEV;
172 break;
173 }
174 /* Enable Enhanced PowerSaver */
175 rdmsrl(MSR_IA32_MISC_ENABLE, val);
176 if (!(val & 1 << 16)) {
177 val |= 1 << 16;
178 wrmsrl(MSR_IA32_MISC_ENABLE, val);
179 /* Can be locked at 0 */
180 rdmsrl(MSR_IA32_MISC_ENABLE, val);
181 if (!(val & 1 << 16)) {
182 printk("eps: Can't enable Enhanced PowerSaver\n");
183 return -ENODEV;
184 }
185 }
186
187 /* Print voltage and multiplier */
188 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
189 current_voltage = lo & 0xff;
190 printk("eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
191 current_multiplier = (lo >> 8) & 0xff;
192 printk("eps: Current multiplier = %d\n", current_multiplier);
193
194 /* Print limits */
195 max_voltage = hi & 0xff;
196 printk("eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
197 max_multiplier = (hi >> 8) & 0xff;
198 printk("eps: Highest multiplier = %d\n", max_multiplier);
199 min_voltage = (hi >> 16) & 0xff;
200 printk("eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
201 min_multiplier = (hi >> 24) & 0xff;
202 printk("eps: Lowest multiplier = %d\n", min_multiplier);
203
204 /* Sanity checks */
205 if (current_multiplier == 0 || max_multiplier == 0
206 || min_multiplier == 0)
207 return -EINVAL;
208 if (current_multiplier > max_multiplier
209 || max_multiplier <= min_multiplier)
210 return -EINVAL;
211 if (current_voltage > 0x1c || max_voltage > 0x1c)
212 return -EINVAL;
213 if (max_voltage < min_voltage)
214 return -EINVAL;
215
216 /* Calc FSB speed */
217 fsb = cpu_khz / current_multiplier;
218 /* Calc number of p-states supported */
219 if (brand == EPS_BRAND_C7M)
220 states = max_multiplier - min_multiplier + 1;
221 else
222 states = 2;
223
224 /* Allocate private data and frequency table for current cpu */
225 centaur = kzalloc(sizeof(struct eps_cpu_data)
226 + (states + 1) * sizeof(struct cpufreq_frequency_table),
227 GFP_KERNEL);
228 if (!centaur)
229 return -ENOMEM;
230 eps_cpu[0] = centaur;
231
232 /* Copy basic values */
233 centaur->fsb = fsb;
234
235 /* Fill frequency and MSR value table */
236 f_table = &centaur->freq_table[0];
237 if (brand != EPS_BRAND_C7M) {
238 f_table[0].frequency = fsb * min_multiplier;
239 f_table[0].index = (min_multiplier << 8) | min_voltage;
240 f_table[1].frequency = fsb * max_multiplier;
241 f_table[1].index = (max_multiplier << 8) | max_voltage;
242 f_table[2].frequency = CPUFREQ_TABLE_END;
243 } else {
244 k = 0;
245 step = ((max_voltage - min_voltage) * 256)
246 / (max_multiplier - min_multiplier);
247 for (i = min_multiplier; i <= max_multiplier; i++) {
248 voltage = (k * step) / 256 + min_voltage;
249 f_table[k].frequency = fsb * i;
250 f_table[k].index = (i << 8) | voltage;
251 k++;
252 }
253 f_table[k].frequency = CPUFREQ_TABLE_END;
254 }
255
256 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
257 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
258 policy->cur = fsb * current_multiplier;
259
260 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
261 if (ret) {
262 kfree(centaur);
263 return ret;
264 }
265
266 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
267 return 0;
268}
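
The MSR fields decoded above pack a voltage index in the low byte (reported as index * 16 + 700 mV) and a multiplier in the next byte. A tiny stand-alone decoder for an invented status word, just to illustrate the bit layout the driver relies on:

#include <stdio.h>

int main(void)
{
	/* hypothetical low word of MSR_IA32_PERF_STATUS */
	unsigned int lo = (12 << 8) | 0x10;	/* mult 12, voltage index 0x10 */
	unsigned int voltage_idx = lo & 0xff;
	unsigned int multiplier = (lo >> 8) & 0xff;

	printf("multiplier = %u\n", multiplier);
	printf("voltage    = %u mV\n", voltage_idx * 16 + 700);
	return 0;
}
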
269
270static int eps_cpu_exit(struct cpufreq_policy *policy)
271{
272 unsigned int cpu = policy->cpu;
273 struct eps_cpu_data *centaur;
274 u32 lo, hi;
275
276 if (eps_cpu[cpu] == NULL)
277 return -ENODEV;
278 centaur = eps_cpu[cpu];
279
280 /* Get max frequency */
281 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
282 /* Set max frequency */
283 eps_set_state(centaur, cpu, hi & 0xffff);
284 /* Bye */
285 cpufreq_frequency_table_put_attr(policy->cpu);
286 kfree(eps_cpu[cpu]);
287 eps_cpu[cpu] = NULL;
288 return 0;
289}
290
291static struct freq_attr* eps_attr[] = {
292 &cpufreq_freq_attr_scaling_available_freqs,
293 NULL,
294};
295
296static struct cpufreq_driver eps_driver = {
297 .verify = eps_verify,
298 .target = eps_target,
299 .init = eps_cpu_init,
300 .exit = eps_cpu_exit,
301 .get = eps_get,
302 .name = "e_powersaver",
303 .owner = THIS_MODULE,
304 .attr = eps_attr,
305};
306
307static int __init eps_init(void)
308{
309 struct cpuinfo_x86 *c = cpu_data;
310
311 /* This driver will work only on Centaur C7 processors with
312 * Enhanced SpeedStep/PowerSaver registers */
313 if (c->x86_vendor != X86_VENDOR_CENTAUR
314 || c->x86 != 6 || c->x86_model != 10)
315 return -ENODEV;
316 if (!cpu_has(c, X86_FEATURE_EST))
317 return -ENODEV;
318
319 if (cpufreq_register_driver(&eps_driver))
320 return -EINVAL;
321 return 0;
322}
323
324static void __exit eps_exit(void)
325{
326 cpufreq_unregister_driver(&eps_driver);
327}
328
 329MODULE_AUTHOR("Rafał Bilski <rafalbilski@interia.pl>");
 330MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
331MODULE_LICENSE("GPL");
332
333module_init(eps_init);
334module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
new file mode 100644
index 000000000000..f317276afa7a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -0,0 +1,309 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/slab.h>
24#include <linux/delay.h>
25#include <linux/cpufreq.h>
26
27#include <asm/msr.h>
28#include <asm/timex.h>
29#include <asm/io.h>
30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
33
34/* Module parameter */
35static int max_freq;
36
37struct s_elan_multiplier {
38 int clock; /* frequency in kHz */
39 int val40h; /* PMU Force Mode register */
40 int val80h; /* CPU Clock Speed Register */
41};
42
43/*
44 * It is important that the frequencies
45 * are listed in ascending order here!
46 */
47struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08},
51 {8000, 0x00, 0x00},
52 {16000, 0x00, 0x02},
53 {33000, 0x00, 0x04},
54 {66000, 0x01, 0x04},
55 {99000, 0x01, 0x05}
56};
57
58static struct cpufreq_frequency_table elanfreq_table[] = {
59 {0, 1000},
60 {1, 2000},
61 {2, 4000},
62 {3, 8000},
63 {4, 16000},
64 {5, 33000},
65 {6, 66000},
66 {7, 99000},
67 {0, CPUFREQ_TABLE_END},
68};
69
70
71/**
72 * elanfreq_get_cpu_frequency: determine current cpu speed
73 *
 74 * Determines the frequency at which the Elan SoC's CPU is currently
 75 * running. Frequencies from 1 to 33 MHz are generated the normal
 76 * way; 66 and 99 MHz are called "Hyperspeed Mode" and keep the
 77 * rest of the chip running at 33 MHz.
78 */
79
80static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
81{
82 u8 clockspeed_reg; /* Clock Speed Register */
83
84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable();
88
89 if ((clockspeed_reg & 0xE0) == 0xE0)
90 return 0;
91
92 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
93 if ((clockspeed_reg & 0xE0) == 0xC0) {
94 if ((clockspeed_reg & 0x01) == 0)
95 return 66000;
96 else
97 return 99000;
98 }
99
100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0)
102 return 33000;
103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
105}
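
For reference, a short stand-alone sketch of the same Clock Speed Register decoding; the sample register values are invented, whereas on real hardware the byte is read from port 0x23 after selecting index 0x80:

#include <stdio.h>

/* decode the Elan CPU Clock Speed Register, mirroring
 * elanfreq_get_cpu_frequency() above */
static unsigned int decode_clockspeed(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;
	if ((reg & 0xE0) == 0xC0)	/* hyperspeed: 66 or 99 MHz */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)	/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	unsigned char samples[] = { 0x00, 0x60, 0xA0, 0xC0, 0xC1 };
	unsigned int i;

	for (i = 0; i < sizeof(samples); i++)
		printf("reg 0x%02x -> %u kHz\n",
		       samples[i], decode_clockspeed(samples[i]));
	return 0;
}
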
106
107
108/**
109 * elanfreq_set_cpu_frequency: Change the CPU core frequency
110 * @cpu: cpu number
111 * @freq: frequency in kHz
112 *
113 * This function takes a frequency value and changes the CPU frequency
114 * according to this. Note that the frequency has to be checked by
115 * elanfreq_validatespeed() for correctness!
116 *
117 * There is no return value.
118 */
119
120static void elanfreq_set_cpu_state (unsigned int state)
121{
122 struct cpufreq_freqs freqs;
123
124 freqs.old = elanfreq_get_cpu_frequency(0);
125 freqs.new = elan_multiplier[state].clock;
126 freqs.cpu = 0; /* elanfreq.c is UP only driver */
127
128 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
129
130 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
131 elan_multiplier[state].clock);
132
133
134 /*
135 * Access to the Elan's internal registers is indexed via
136 * 0x22: Chip Setup & Control Register Index Register (CSCI)
137 * 0x23: Chip Setup & Control Register Data Register (CSCD)
138 *
139 */
140
141 /*
142 * 0x40 is the Power Management Unit's Force Mode Register.
143 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
144 */
145
146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */
151
152 local_irq_disable();
153
154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR);
157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR);
161 udelay(10000);
162 local_irq_enable();
163
164 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
165};
166
167
168/**
169 * elanfreq_validatespeed: test if frequency range is valid
170 * @policy: the policy to validate
171 *
172 * This function checks if a given frequency range in kHz is valid
173 * for the hardware supported by the driver.
174 */
175
176static int elanfreq_verify (struct cpufreq_policy *policy)
177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179}
180
181static int elanfreq_target (struct cpufreq_policy *policy,
182 unsigned int target_freq,
183 unsigned int relation)
184{
185 unsigned int newstate = 0;
186
187 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = cpu_data;
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
223 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
224 policy->cur = elanfreq_get_cpu_frequency(0);
225
226 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
227 if (result)
228 return (result);
229
230 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
231 return 0;
232}
233
234
235static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
236{
237 cpufreq_frequency_table_put_attr(policy->cpu);
238 return 0;
239}
240
241
242#ifndef MODULE
243/**
244 * elanfreq_setup - elanfreq command line parameter parsing
245 *
246 * elanfreq command line parameter. Use:
247 * elanfreq=66000
248 * to set the maximum CPU frequency to 66 MHz. Note that in
249 * case you do not give this boot parameter, the maximum
250 * frequency will fall back to _current_ CPU frequency which
251 * might be lower. If you build this as a module, use the
252 * max_freq module parameter instead.
253 */
254static int __init elanfreq_setup(char *str)
255{
256 max_freq = simple_strtoul(str, &str, 0);
257 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
258 return 1;
259}
260__setup("elanfreq=", elanfreq_setup);
261#endif
262
263
264static struct freq_attr* elanfreq_attr[] = {
265 &cpufreq_freq_attr_scaling_available_freqs,
266 NULL,
267};
268
269
270static struct cpufreq_driver elanfreq_driver = {
271 .get = elanfreq_get_cpu_frequency,
272 .verify = elanfreq_verify,
273 .target = elanfreq_target,
274 .init = elanfreq_cpu_init,
275 .exit = elanfreq_cpu_exit,
276 .name = "elanfreq",
277 .owner = THIS_MODULE,
278 .attr = elanfreq_attr,
279};
280
281
282static int __init elanfreq_init(void)
283{
284 struct cpuinfo_x86 *c = cpu_data;
285
286 /* Test if we have the right hardware */
287 if ((c->x86_vendor != X86_VENDOR_AMD) ||
288 (c->x86 != 4) || (c->x86_model!=10)) {
289 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
290 return -ENODEV;
291 }
292 return cpufreq_register_driver(&elanfreq_driver);
293}
294
295
296static void __exit elanfreq_exit(void)
297{
298 cpufreq_unregister_driver(&elanfreq_driver);
299}
300
301
302module_param (max_freq, int, 0444);
303
304MODULE_LICENSE("GPL");
305MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
new file mode 100644
index 000000000000..461dabc4e495
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -0,0 +1,495 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
 15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
 19 * CPU frequency control on the NatSemi Geode GX1/GXLV processor and CS55x0
 20 * is based on Suspend Modulation.
 21 *
 22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
 23 * to the CPU (GX1/GXLV) for configurable durations. While SUSP# is
 24 * asserted the CPU enters an idle state; GX1 stops its core clock, so
 25 * power consumption is reduced.
26 *
 27 * Suspend Modulation's OFF/ON durations are configurable
28 * with 'Suspend Modulation OFF Count Register'
29 * and 'Suspend Modulation ON Count Register'.
30 * These registers are 8bit counters that represent the number of
31 * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
32 * to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
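
A quick numeric check of the relations above: for a requested frequency, off_duration = freq * DURATION / stock_freq and on_duration is the remainder, so F_eff = Fgx * OFF / (OFF + ON) approximates the request within the 32 us quantisation. A stand-alone sketch with purely illustrative values (DURATION taken as the 255 default used for max_duration below):

#include <stdio.h>

#define DURATION 255			/* default max_duration */

int main(void)
{
	unsigned int stock_khz = 200000;	/* hypothetical 200 MHz GX */
	unsigned int want_khz = 75000;		/* requested effective speed */

	/* split DURATION into off/on counts as described above */
	unsigned int off = (unsigned long long)want_khz * DURATION / stock_khz;
	unsigned int on = DURATION - off;

	/* effective frequency the modulation actually yields */
	unsigned int eff = (unsigned long long)stock_khz * off / (off + on);

	printf("off=%u on=%u -> F_eff=%u kHz (wanted %u)\n",
	       off, on, eff, want_khz);
	return 0;
}
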
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <asm/processor-cyrix.h>
83#include <asm/errno.h>
84
85/* PCI config registers, all at F0 */
86#define PCI_PMER1 0x80 /* power management enable register 1 */
87#define PCI_PMER2 0x81 /* power management enable register 2 */
88#define PCI_PMER3 0x82 /* power management enable register 3 */
89#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
90#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
91#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
92#define PCI_MODON 0x95 /* suspend modulation ON counter register */
93#define PCI_SUSCFG 0x96 /* suspend configuration register */
94
95/* PMER1 bits */
96#define GPM (1<<0) /* global power management */
97#define GIT (1<<1) /* globally enable PM device idle timers */
98#define GTR (1<<2) /* globally enable IO traps */
99#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
100#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
101
102/* SUSCFG bits */
103#define SUSMOD (1<<0) /* enable/disable suspend modulation */
 104/* the bits below are supported only on cs5530 (after rev.1.2)/cs5530A */
105#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
106 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
107#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
 108/* the bits below are supported only on cs5530A */
109#define PWRSVE_ISA (1<<3) /* stop ISA clock */
110#define PWRSVE (1<<4) /* active idle */
111
112struct gxfreq_params {
113 u8 on_duration;
114 u8 off_duration;
115 u8 pci_suscfg;
116 u8 pci_pmer1;
117 u8 pci_pmer2;
118 struct pci_dev *cs55x0;
119};
120
121static struct gxfreq_params *gx_params;
122static int stock_freq;
123
124/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
125static int pci_busclk = 0;
126module_param (pci_busclk, int, 0444);
127
128/* maximum duration for which the cpu may be suspended
129 * (32us * MAX_DURATION). If no parameter is given, this defaults
130 * to 255.
131 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
132 * is suspended -- processing power is just 0.39% of what it used to be,
133 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
134static int max_duration = 255;
135module_param (max_duration, int, 0444);
136
137/* For the default policy, we want at least some processing power
138 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
139 */
140#define POLICY_MIN_DIV 20
141
142
143#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg)
144
145/**
 146 * we can detect the core multiplier from dir0_lsb
147 * from GX1 datasheet p.56,
148 * MULT[3:0]:
149 * 0000 = SYSCLK multiplied by 4 (test only)
150 * 0001 = SYSCLK multiplied by 10
151 * 0010 = SYSCLK multiplied by 4
152 * 0011 = SYSCLK multiplied by 6
153 * 0100 = SYSCLK multiplied by 9
154 * 0101 = SYSCLK multiplied by 5
155 * 0110 = SYSCLK multiplied by 7
156 * 0111 = SYSCLK multiplied by 8
157 * of 33.3MHz
158 **/
159static int gx_freq_mult[16] = {
160 4, 10, 4, 6, 9, 5, 7, 8,
161 0, 0, 0, 0, 0, 0, 0, 0
162};
163
164
165/****************************************************************
166 * Low Level chipset interface *
167 ****************************************************************/
168static struct pci_device_id gx_chipset_tbl[] __initdata = {
169 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID },
170 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID },
171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID },
172 { 0, },
173};
174
175/**
176 * gx_detect_chipset:
177 *
178 **/
179static __init struct pci_dev *gx_detect_chipset(void)
180{
181 struct pci_dev *gx_pci = NULL;
182
183 /* check if CPU is a MediaGX or a Geode. */
184 if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
185 (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
186 dprintk("error: no MediaGX/Geode processor found!\n");
187 return NULL;
188 }
189
190 /* detect which companion chip is used */
191 while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
192 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
193 return gx_pci;
194 }
195
196 dprintk("error: no supported chipset found!\n");
197 return NULL;
198}
199
200/**
201 * gx_get_cpuspeed:
202 *
 203 * Finds the effective frequency at which the Cyrix MediaGX/NatSemi Geode CPU runs.
204 */
205static unsigned int gx_get_cpuspeed(unsigned int cpu)
206{
207 if ((gx_params->pci_suscfg & SUSMOD) == 0)
208 return stock_freq;
209
210 return (stock_freq * gx_params->off_duration)
211 / (gx_params->on_duration + gx_params->off_duration);
212}
213
214/**
215 * gx_validate_speed:
 216 * find the closest achievable speed and the matching on/off durations
217 *
218 **/
219
220static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration)
221{
222 unsigned int i;
223 u8 tmp_on, tmp_off;
224 int old_tmp_freq = stock_freq;
225 int tmp_freq;
226
227 *off_duration=1;
228 *on_duration=0;
229
230 for (i=max_duration; i>0; i--) {
231 tmp_off = ((khz * i) / stock_freq) & 0xff;
232 tmp_on = i - tmp_off;
233 tmp_freq = (stock_freq * tmp_off) / i;
234 /* if this relation is closer to khz, use this. If it's equal,
235 * prefer it, too - lower latency */
236 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
237 *on_duration = tmp_on;
238 *off_duration = tmp_off;
239 old_tmp_freq = tmp_freq;
240 }
241 }
242
243 return old_tmp_freq;
244}
245
246
247/**
248 * gx_set_cpuspeed:
249 * set cpu speed in khz.
250 **/
251
252static void gx_set_cpuspeed(unsigned int khz)
253{
254 u8 suscfg, pmer1;
255 unsigned int new_khz;
256 unsigned long flags;
257 struct cpufreq_freqs freqs;
258
259 freqs.cpu = 0;
260 freqs.old = gx_get_cpuspeed(0);
261
262 new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration);
263
264 freqs.new = new_khz;
265
266 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
267 local_irq_save(flags);
268
 269	if (new_khz != stock_freq) { /* new khz == 100% of CPU speed is a special case */
270 switch (gx_params->cs55x0->device) {
271 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
272 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
273 /* FIXME: need to test other values -- Zwane,Miura */
274 pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */
275 pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */
276 pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1);
277
278 if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */
279 suscfg = gx_params->pci_suscfg | SUSMOD;
280 } else { /* CS5530A,B.. */
281 suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE;
282 }
283 break;
284 case PCI_DEVICE_ID_CYRIX_5520:
285 case PCI_DEVICE_ID_CYRIX_5510:
286 suscfg = gx_params->pci_suscfg | SUSMOD;
287 break;
288 default:
289 local_irq_restore(flags);
290 dprintk("fatal: try to set unknown chipset.\n");
291 return;
292 }
293 } else {
294 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
295 gx_params->off_duration = 0;
296 gx_params->on_duration = 0;
297 dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n");
298 }
299
300 pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration);
301 pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration);
302
303 pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg);
304 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
305
306 local_irq_restore(flags);
307
308 gx_params->pci_suscfg = suscfg;
309
310 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
311
312 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
313 gx_params->on_duration * 32, gx_params->off_duration * 32);
314 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
315}
316
317/****************************************************************
318 * High level functions *
319 ****************************************************************/
320
321/*
322 * cpufreq_gx_verify: test if frequency range is valid
323 *
324 * This function checks if a given frequency range in kHz is valid
325 * for the hardware supported by the driver.
326 */
327
328static int cpufreq_gx_verify(struct cpufreq_policy *policy)
329{
330 unsigned int tmp_freq = 0;
331 u8 tmp1, tmp2;
332
333 if (!stock_freq || !policy)
334 return -EINVAL;
335
336 policy->cpu = 0;
337 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
338
339 /* it needs to be assured that at least one supported frequency is
340 * within policy->min and policy->max. If it is not, policy->max
 341 * needs to be increased until one frequency is supported.
342 * policy->min may not be decreased, though. This way we guarantee a
343 * specific processing capacity.
344 */
345 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
346 if (tmp_freq < policy->min)
347 tmp_freq += stock_freq / max_duration;
348 policy->min = tmp_freq;
349 if (policy->min > policy->max)
350 policy->max = tmp_freq;
351 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
352 if (tmp_freq > policy->max)
353 tmp_freq -= stock_freq / max_duration;
354 policy->max = tmp_freq;
355 if (policy->max < policy->min)
356 policy->max = policy->min;
357 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq);
358
359 return 0;
360}
361
362/*
363 * cpufreq_gx_target:
364 *
365 */
366static int cpufreq_gx_target(struct cpufreq_policy *policy,
367 unsigned int target_freq,
368 unsigned int relation)
369{
370 u8 tmp1, tmp2;
371 unsigned int tmp_freq;
372
373 if (!stock_freq || !policy)
374 return -EINVAL;
375
376 policy->cpu = 0;
377
378 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
379 while (tmp_freq < policy->min) {
380 tmp_freq += stock_freq / max_duration;
381 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
382 }
383 while (tmp_freq > policy->max) {
384 tmp_freq -= stock_freq / max_duration;
385 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
386 }
387
388 gx_set_cpuspeed(tmp_freq);
389
390 return 0;
391}
392
393static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
394{
395 unsigned int maxfreq, curfreq;
396
397 if (!policy || policy->cpu != 0)
398 return -ENODEV;
399
400 /* determine maximum frequency */
401 if (pci_busclk) {
402 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
403 } else if (cpu_khz) {
404 maxfreq = cpu_khz;
405 } else {
406 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
407 }
408 stock_freq = maxfreq;
409 curfreq = gx_get_cpuspeed(0);
410
411 dprintk("cpu max frequency is %d.\n", maxfreq);
412 dprintk("cpu current frequency is %dkHz.\n",curfreq);
413
414 /* setup basic struct for cpufreq API */
415 policy->cpu = 0;
416
417 if (max_duration < POLICY_MIN_DIV)
418 policy->min = maxfreq / max_duration;
419 else
420 policy->min = maxfreq / POLICY_MIN_DIV;
421 policy->max = maxfreq;
422 policy->cur = curfreq;
423 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
424 policy->cpuinfo.min_freq = maxfreq / max_duration;
425 policy->cpuinfo.max_freq = maxfreq;
426 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
427
428 return 0;
429}
430
431/*
432 * cpufreq_gx_init:
433 * MediaGX/Geode GX initialize cpufreq driver
434 */
435static struct cpufreq_driver gx_suspmod_driver = {
436 .get = gx_get_cpuspeed,
437 .verify = cpufreq_gx_verify,
438 .target = cpufreq_gx_target,
439 .init = cpufreq_gx_cpu_init,
440 .name = "gx-suspmod",
441 .owner = THIS_MODULE,
442};
443
444static int __init cpufreq_gx_init(void)
445{
446 int ret;
447 struct gxfreq_params *params;
448 struct pci_dev *gx_pci;
449
450 /* Test if we have the right hardware */
451 if ((gx_pci = gx_detect_chipset()) == NULL)
452 return -ENODEV;
453
454 /* check whether module parameters are sane */
455 if (max_duration > 0xff)
456 max_duration = 0xff;
457
458 dprintk("geode suspend modulation available.\n");
459
460 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
461 if (params == NULL)
462 return -ENOMEM;
463
464 params->cs55x0 = gx_pci;
465 gx_params = params;
466
467 /* keep cs55x0 configurations */
468 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
469 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
470 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
471 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
472 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
473
474 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
475 kfree(params);
476 return ret; /* register error! */
477 }
478
479 return 0;
480}
481
482static void __exit cpufreq_gx_exit(void)
483{
484 cpufreq_unregister_driver(&gx_suspmod_driver);
485 pci_dev_put(gx_params->cs55x0);
486 kfree(gx_params);
487}
488
489MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>");
490MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
491MODULE_LICENSE ("GPL");
492
493module_init(cpufreq_gx_init);
494module_exit(cpufreq_gx_exit);
495
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
new file mode 100644
index 000000000000..f0cce3c2dc3a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -0,0 +1,1024 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
 8 * VIA currently has 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
 17 * It is pretty much the same, feature-wise, as longhaul v2, though
 18 * there is provision for scaling the FSB too; this doesn't work
 19 * well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33
34#include <asm/msr.h>
35#include <asm/timex.h>
36#include <asm/io.h>
37#include <asm/acpi.h>
38#include <linux/acpi.h>
39#include <acpi/processor.h>
40
41#include "longhaul.h"
42
43#define PFX "longhaul: "
44
45#define TYPE_LONGHAUL_V1 1
46#define TYPE_LONGHAUL_V2 2
47#define TYPE_POWERSAVER 3
48
49#define CPU_SAMUEL 1
50#define CPU_SAMUEL2 2
51#define CPU_EZRA 3
52#define CPU_EZRA_T 4
53#define CPU_NEHEMIAH 5
54#define CPU_NEHEMIAH_C 6
55
56/* Flags */
57#define USE_ACPI_C3 (1 << 1)
58#define USE_NORTHBRIDGE (1 << 2)
59
60static int cpu_model;
61static unsigned int numscales=16;
62static unsigned int fsb;
63
64static const struct mV_pos *vrm_mV_table;
65static const unsigned char *mV_vrm_table;
66
67static unsigned int highest_speed, lowest_speed; /* kHz */
68static unsigned int minmult, maxmult;
69static int can_scale_voltage;
70static struct acpi_processor *pr = NULL;
71static struct acpi_processor_cx *cx = NULL;
72static u32 acpi_regs_addr;
73static u8 longhaul_flags;
74static unsigned int longhaul_index;
75
76/* Module parameters */
77static int scale_voltage;
78static int disable_acpi_c3;
79static int revid_errata;
80
81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int clock_ratio[32];
86static int eblcr_table[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
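
Because the multiplier is stored times ten, calc_speed() turns, for example, mult = 65 (a 6.5x ratio) on a 133 MHz FSB into (65/10)*133 + 133/2 = 864, returned as 864000 kHz. A trivial stand-alone copy of that arithmetic, with an invented FSB value:

#include <stdio.h>

/* same arithmetic as calc_speed(): multiplier is stored * 10 */
static unsigned int calc_speed(int mult, int fsb_mhz)
{
	int khz = (mult / 10) * fsb_mhz;

	if (mult % 10)
		khz += fsb_mhz / 2;
	return khz * 1000;
}

int main(void)
{
	/* hypothetical 6.5x multiplier on a 133 MHz bus */
	printf("%u kHz\n", calc_speed(65, 133));
	return 0;
}
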
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue=0,lo, hi;
126
127 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22;
129 if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) {
130 if (lo & (1<<27))
131 invalue+=16;
132 }
133 return eblcr_table[invalue];
134}
135
136/* For processor with BCR2 MSR */
137
138static void do_longhaul1(unsigned int clock_ratio_index)
139{
140 union msr_bcr2 bcr2;
141
142 rdmsrl(MSR_VIA_BCR2, bcr2.val);
143 /* Enable software clock multiplier */
144 bcr2.bits.ESOFTBF = 1;
145 bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff;
146
147 /* Sync to timer tick */
148 safe_halt();
149 /* Change frequency on next halt or sleep */
150 wrmsrl(MSR_VIA_BCR2, bcr2.val);
151 /* Invoke transition */
152 ACPI_FLUSH_CPU_CACHE();
153 halt();
154
155 /* Disable software clock multiplier */
156 local_irq_disable();
157 rdmsrl(MSR_VIA_BCR2, bcr2.val);
158 bcr2.bits.ESOFTBF = 0;
159 wrmsrl(MSR_VIA_BCR2, bcr2.val);
160}
161
162/* For processor with Longhaul MSR */
163
164static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
165 unsigned int dir)
166{
167 union msr_longhaul longhaul;
168 u32 t;
169
170 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
171 /* Setup new frequency */
172 if (!revid_errata)
173 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
174 else
175 longhaul.bits.RevisionKey = 0;
176 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
177 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
178 /* Setup new voltage */
179 if (can_scale_voltage)
180 longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f;
181 /* Sync to timer tick */
182 safe_halt();
183 /* Raise voltage if necessary */
184 if (can_scale_voltage && dir) {
185 longhaul.bits.EnableSoftVID = 1;
186 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
187 /* Change voltage */
188 if (!cx_address) {
189 ACPI_FLUSH_CPU_CACHE();
190 halt();
191 } else {
192 ACPI_FLUSH_CPU_CACHE();
193 /* Invoke C3 */
194 inb(cx_address);
195 /* Dummy op - must do something useless after P_LVL3
196 * read */
197 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
198 }
199 longhaul.bits.EnableSoftVID = 0;
200 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
201 }
202
203 /* Change frequency on next halt or sleep */
204 longhaul.bits.EnableSoftBusRatio = 1;
205 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
206 if (!cx_address) {
207 ACPI_FLUSH_CPU_CACHE();
208 halt();
209 } else {
210 ACPI_FLUSH_CPU_CACHE();
211 /* Invoke C3 */
212 inb(cx_address);
213 /* Dummy op - must do something useless after P_LVL3 read */
214 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
215 }
216 /* Disable bus ratio bit */
217 longhaul.bits.EnableSoftBusRatio = 0;
218 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
219
220 /* Reduce voltage if necessary */
221 if (can_scale_voltage && !dir) {
222 longhaul.bits.EnableSoftVID = 1;
223 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
224 /* Change voltage */
225 if (!cx_address) {
226 ACPI_FLUSH_CPU_CACHE();
227 halt();
228 } else {
229 ACPI_FLUSH_CPU_CACHE();
230 /* Invoke C3 */
231 inb(cx_address);
232 /* Dummy op - must do something useless after P_LVL3
233 * read */
234 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
235 }
236 longhaul.bits.EnableSoftVID = 0;
237 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
238 }
239}
240
241/**
242 * longhaul_set_cpu_frequency()
243 * @clock_ratio_index : bitpattern of the new multiplier.
244 *
245 * Sets a new clock ratio.
246 */
247
248static void longhaul_setstate(unsigned int table_index)
249{
250 unsigned int clock_ratio_index;
251 int speed, mult;
252 struct cpufreq_freqs freqs;
253 unsigned long flags;
254 unsigned int pic1_mask, pic2_mask;
255 u16 bm_status = 0;
256 u32 bm_timeout = 1000;
257 unsigned int dir = 0;
258
259 clock_ratio_index = longhaul_table[table_index].index;
260 /* Safety precautions */
261 mult = clock_ratio[clock_ratio_index & 0x1f];
262 if (mult == -1)
263 return;
264 speed = calc_speed(mult);
265 if ((speed > highest_speed) || (speed < lowest_speed))
266 return;
267 /* Voltage transition before frequency transition? */
268 if (can_scale_voltage && longhaul_index < table_index)
269 dir = 1;
270
271 freqs.old = calc_speed(longhaul_get_cpu_mult());
272 freqs.new = speed;
273 freqs.cpu = 0; /* longhaul.c is UP only driver */
274
275 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
276
277 dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
278 fsb, mult/10, mult%10, print_speed(speed/1000));
279retry_loop:
280 preempt_disable();
281 local_irq_save(flags);
282
283 pic2_mask = inb(0xA1);
284 pic1_mask = inb(0x21); /* works on C3. save mask. */
285 outb(0xFF,0xA1); /* Overkill */
286 outb(0xFE,0x21); /* TMR0 only */
287
288 /* Wait while PCI bus is busy. */
289 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
290 || ((pr != NULL) && pr->flags.bm_control))) {
291 bm_status = inw(acpi_regs_addr);
292 bm_status &= 1 << 4;
293 while (bm_status && bm_timeout) {
294 outw(1 << 4, acpi_regs_addr);
295 bm_timeout--;
296 bm_status = inw(acpi_regs_addr);
297 bm_status &= 1 << 4;
298 }
299 }
300
301 if (longhaul_flags & USE_NORTHBRIDGE) {
302 /* Disable AGP and PCI arbiters */
303 outb(3, 0x22);
304 } else if ((pr != NULL) && pr->flags.bm_control) {
305 /* Disable bus master arbitration */
306 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
307 }
308 switch (longhaul_version) {
309
310 /*
311 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
312 * Software controlled multipliers only.
313 */
314 case TYPE_LONGHAUL_V1:
315 do_longhaul1(clock_ratio_index);
316 break;
317
318 /*
319 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
320 *
321 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
322 * Nehemiah can do FSB scaling too, but this has never been proven
323 * to work in practice.
324 */
325 case TYPE_LONGHAUL_V2:
326 case TYPE_POWERSAVER:
327 if (longhaul_flags & USE_ACPI_C3) {
328 /* Don't allow wakeup */
329 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
330 do_powersaver(cx->address, clock_ratio_index, dir);
331 } else {
332 do_powersaver(0, clock_ratio_index, dir);
333 }
334 break;
335 }
336
337 if (longhaul_flags & USE_NORTHBRIDGE) {
338 /* Enable arbiters */
339 outb(0, 0x22);
340 } else if ((pr != NULL) && pr->flags.bm_control) {
341 /* Enable bus master arbitration */
342 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
343 }
344 outb(pic2_mask,0xA1); /* restore mask */
345 outb(pic1_mask,0x21);
346
347 local_irq_restore(flags);
348 preempt_enable();
349
350 freqs.new = calc_speed(longhaul_get_cpu_mult());
351 /* Check if requested frequency is set. */
352 if (unlikely(freqs.new != speed)) {
353 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
354 /* Revision ID = 1 but processor is expecting revision key
355 * equal to 0. Jumpers at the bottom of processor will change
356 * multiplier and FSB, but will not change bits in Longhaul
357 * MSR nor enable voltage scaling. */
358 if (!revid_errata) {
359 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
360 "option.\n");
361 revid_errata = 1;
362 msleep(200);
363 goto retry_loop;
364 }
365 /* Why ACPI C3 sometimes doesn't work is a mystery to me, but it
366 * does happen: the processor enters the ACPI C3 state yet the
367 * frequency doesn't change. I tried poking various bits in the
368 * northbridge registers, but without success. */
369 if (longhaul_flags & USE_ACPI_C3) {
370 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
371 longhaul_flags &= ~USE_ACPI_C3;
372 if (revid_errata) {
373 printk(KERN_INFO PFX "Disabling \"Ignore "
374 "Revision ID\" option.\n");
375 revid_errata = 0;
376 }
377 msleep(200);
378 goto retry_loop;
379 }
380 /* This shouldn't happen. Longhaul ver. 2 was reported not
381 * working on processors without voltage scaling, but with
382 * RevID = 1. The RevID errata handling should make things
383 * right; this is just to be 100% sure. */
384 if (longhaul_version == TYPE_LONGHAUL_V2) {
385 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
386 longhaul_version = TYPE_LONGHAUL_V1;
387 msleep(200);
388 goto retry_loop;
389 }
390 }
391 /* Report true CPU frequency */
392 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
393
394 if (!bm_timeout)
395 printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n");
396}
397
398/*
399 * Centaur decided to make life a little more tricky.
400 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
401 * Samuel2 and above have to try and guess what the FSB is.
402 * We do this by assuming we booted at maximum multiplier, and interpolate
403 * between that value multiplied by possible FSBs and cpu_mhz which
404 * was calculated at boot time. Really ugly, but no other way to do this.
405 */
406
407#define ROUNDING 0xf
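/*
 * Editorial note, purely illustrative (values assumed, not from the original
 * source): for a C3 measured at cpu_khz ~= 1000000 (speed = 1000 MHz) running
 * at a 7.5x multiplier (mult = 75), guess_fsb() below tries each candidate:
 *   speeds[2] = 1333:  f_max = (1333 * 75 + 50) / 100 + 15 / 2 = 1007,
 *                      f_min = 1007 - 15 = 992,
 * and 992 <= 1000 <= 1007, so it returns 1333 / 10 = 133 (MHz FSB).
 */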
408
409static int guess_fsb(int mult)
410{
411 int speed = cpu_khz / 1000;
412 int i;
413 int speeds[] = { 666, 1000, 1333, 2000 };
414 int f_max, f_min;
415
416 for (i = 0; i < 4; i++) {
417 f_max = ((speeds[i] * mult) + 50) / 100;
418 f_max += (ROUNDING / 2);
419 f_min = f_max - ROUNDING;
420 if ((speed <= f_max) && (speed >= f_min))
421 return speeds[i] / 10;
422 }
423 return 0;
424}
425
426
427static int __init longhaul_get_ranges(void)
428{
429 unsigned int i, j, k = 0;
430 unsigned int ratio;
431 int mult;
432
433 /* Get current frequency */
434 mult = longhaul_get_cpu_mult();
435 if (mult == -1) {
436 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
437 return -EINVAL;
438 }
439 fsb = guess_fsb(mult);
440 if (fsb == 0) {
441 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
442 return -EINVAL;
443 }
444 /* Get max multiplier - as we always did.
445 * Longhaul MSR is useful only when voltage scaling is enabled.
446 * C3 is booting at max anyway. */
447 maxmult = mult;
448 /* Get min multiplier */
449 switch (cpu_model) {
450 case CPU_NEHEMIAH:
451 minmult = 50;
452 break;
453 case CPU_NEHEMIAH_C:
454 minmult = 40;
455 break;
456 default:
457 minmult = 30;
458 break;
459 }
460
461 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
462 minmult/10, minmult%10, maxmult/10, maxmult%10);
463
464 highest_speed = calc_speed(maxmult);
465 lowest_speed = calc_speed(minmult);
466 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
467 print_speed(lowest_speed/1000),
468 print_speed(highest_speed/1000));
469
470 if (lowest_speed == highest_speed) {
471 printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n");
472 return -EINVAL;
473 }
474 if (lowest_speed > highest_speed) {
475 printk (KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
476 lowest_speed, highest_speed);
477 return -EINVAL;
478 }
479
480 longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL);
481 if(!longhaul_table)
482 return -ENOMEM;
483
484 for (j = 0; j < numscales; j++) {
485 ratio = clock_ratio[j];
486 if (ratio == -1)
487 continue;
488 if (ratio > maxmult || ratio < minmult)
489 continue;
490 longhaul_table[k].frequency = calc_speed(ratio);
491 longhaul_table[k].index = j;
492 k++;
493 }
494 if (k <= 1) {
495 kfree(longhaul_table);
496 return -ENODEV;
497 }
498 /* Sort */
499 for (j = 0; j < k - 1; j++) {
500 unsigned int min_f, min_i;
501 min_f = longhaul_table[j].frequency;
502 min_i = j;
503 for (i = j + 1; i < k; i++) {
504 if (longhaul_table[i].frequency < min_f) {
505 min_f = longhaul_table[i].frequency;
506 min_i = i;
507 }
508 }
509 if (min_i != j) {
510 unsigned int temp;
511 temp = longhaul_table[j].frequency;
512 longhaul_table[j].frequency = longhaul_table[min_i].frequency;
513 longhaul_table[min_i].frequency = temp;
514 temp = longhaul_table[j].index;
515 longhaul_table[j].index = longhaul_table[min_i].index;
516 longhaul_table[min_i].index = temp;
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __init longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000);
565 return;
566 }
567
568 if (minvid.mV == maxvid.mV) {
569 printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
570 "both %d.%03d. Voltage scaling disabled\n",
571 maxvid.mV/1000, maxvid.mV%1000);
572 return;
573 }
574
575 /* How many voltage steps */
576 numvscales = maxvid.pos - minvid.pos + 1;
577 printk(KERN_INFO PFX
578 "Max VID=%d.%03d "
579 "Min VID=%d.%03d, "
580 "%d possible voltage scales\n",
581 maxvid.mV/1000, maxvid.mV%1000,
582 minvid.mV/1000, minvid.mV%1000,
583 numvscales);
584
585 /* Calculate max frequency at min voltage */
586 j = longhaul.bits.MinMHzBR;
587 if (longhaul.bits.MinMHzBR4)
588 j += 16;
589 min_vid_speed = eblcr_table[j];
590 if (min_vid_speed == -1)
591 return;
592 switch (longhaul.bits.MinMHzFSB) {
593 case 0:
594 min_vid_speed *= 13333;
595 break;
596 case 1:
597 min_vid_speed *= 10000;
598 break;
599 case 3:
600 min_vid_speed *= 6666;
601 break;
602 default:
603 return;
604 break;
605 }
606 if (min_vid_speed >= highest_speed)
607 return;
608 /* Calculate kHz for one voltage step */
609 kHz_step = (highest_speed - min_vid_speed) / numvscales;
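 /*
  * Illustrative example (assumed numbers, not from the original code):
  * with highest_speed = 1000000 kHz, min_vid_speed = 733000 kHz and
  * numvscales = 9, kHz_step is ~29666 kHz; the loop below then maps an
  * 866000 kHz entry to pos = (866000 - 733000) / 29666 + minvid.pos,
  * i.e. four VID positions above the minimum voltage.
  */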
610
611 j = 0;
612 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
613 speed = longhaul_table[j].frequency;
614 if (speed > min_vid_speed)
615 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
616 else
617 pos = minvid.pos;
618 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
619 vid = vrm_mV_table[mV_vrm_table[pos]];
620 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV);
621 j++;
622 }
623
624 can_scale_voltage = 1;
625 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
626}
627
628
629static int longhaul_verify(struct cpufreq_policy *policy)
630{
631 return cpufreq_frequency_table_verify(policy, longhaul_table);
632}
633
634
635static int longhaul_target(struct cpufreq_policy *policy,
636 unsigned int target_freq, unsigned int relation)
637{
638 unsigned int table_index = 0;
639 unsigned int i;
640 unsigned int dir = 0;
641 u8 vid, current_vid;
642
643 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index))
644 return -EINVAL;
645
646 /* Don't set same frequency again */
647 if (longhaul_index == table_index)
648 return 0;
649
650 if (!can_scale_voltage)
651 longhaul_setstate(table_index);
652 else {
653 /* On the test system, voltage transitions exceeding a single
654 * step up or down were turning the motherboard off. Both
655 * "ondemand" and "userspace" governors are unsafe here. The C7
656 * does this stepping in hardware; the C3 is older, so we have
657 * to do it in software, one voltage step at a time. */
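 /* Illustrative walk-through (assumed indices): moving up from
  * longhaul_index = 2 to table_index = 5, the loop below visits
  * i = 2, 3, 4 and, whenever an entry's VID differs from the current
  * one, calls longhaul_setstate(i) and sleeps 200 ms before moving on;
  * the final longhaul_setstate(table_index) completes the transition. */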
658 i = longhaul_index;
659 current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f;
660 if (table_index > longhaul_index)
661 dir = 1;
662 while (i != table_index) {
663 vid = (longhaul_table[i].index >> 8) & 0x1f;
664 if (vid != current_vid) {
665 longhaul_setstate(i);
666 current_vid = vid;
667 msleep(200);
668 }
669 if (dir)
670 i++;
671 else
672 i--;
673 }
674 longhaul_setstate(table_index);
675 }
676 longhaul_index = table_index;
677 return 0;
678}
679
680
681static unsigned int longhaul_get(unsigned int cpu)
682{
683 if (cpu)
684 return 0;
685 return calc_speed(longhaul_get_cpu_mult());
686}
687
688static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
689 u32 nesting_level,
690 void *context, void **return_value)
691{
692 struct acpi_device *d;
693
694 if ( acpi_bus_get_device(obj_handle, &d) ) {
695 return 0;
696 }
697 *return_value = (void *)acpi_driver_data(d);
698 return 1;
699}
700
701/* VIA doesn't support the PM2 register, but has something similar */
702static int enable_arbiter_disable(void)
703{
704 struct pci_dev *dev;
705 int status = 1;
706 int reg;
707 u8 pci_cmd;
708
709 /* Find PLE133 host bridge */
710 reg = 0x78;
711 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
712 NULL);
713 /* Find CLE266 host bridge */
714 if (dev == NULL) {
715 reg = 0x76;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA,
717 PCI_DEVICE_ID_VIA_862X_0, NULL);
718 /* Find CN400 V-Link host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
721 }
722 if (dev != NULL) {
723 /* Enable access to port 0x22 */
724 pci_read_config_byte(dev, reg, &pci_cmd);
725 if (!(pci_cmd & 1<<7)) {
726 pci_cmd |= 1<<7;
727 pci_write_config_byte(dev, reg, pci_cmd);
728 pci_read_config_byte(dev, reg, &pci_cmd);
729 if (!(pci_cmd & 1<<7)) {
730 printk(KERN_ERR PFX
731 "Can't enable access to port 0x22.\n");
732 status = 0;
733 }
734 }
735 pci_dev_put(dev);
736 return status;
737 }
738 return 0;
739}
740
741static int longhaul_setup_southbridge(void)
742{
743 struct pci_dev *dev;
744 u8 pci_cmd;
745
746 /* Find VT8235 southbridge */
747 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
748 if (dev == NULL)
749 /* Find VT8237 southbridge */
750 dev = pci_get_device(PCI_VENDOR_ID_VIA,
751 PCI_DEVICE_ID_VIA_8237, NULL);
752 if (dev != NULL) {
753 /* Set transition time to max */
754 pci_read_config_byte(dev, 0xec, &pci_cmd);
755 pci_cmd &= ~(1 << 2);
756 pci_write_config_byte(dev, 0xec, pci_cmd);
757 pci_read_config_byte(dev, 0xe4, &pci_cmd);
758 pci_cmd &= ~(1 << 7);
759 pci_write_config_byte(dev, 0xe4, pci_cmd);
760 pci_read_config_byte(dev, 0xe5, &pci_cmd);
761 pci_cmd |= 1 << 7;
762 pci_write_config_byte(dev, 0xe5, pci_cmd);
763 /* Get address of ACPI registers block*/
764 pci_read_config_byte(dev, 0x81, &pci_cmd);
765 if (pci_cmd & 1 << 7) {
766 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
767 acpi_regs_addr &= 0xff00;
768 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr);
769 }
770
771 pci_dev_put(dev);
772 return 1;
773 }
774 return 0;
775}
776
777static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
778{
779 struct cpuinfo_x86 *c = cpu_data;
780 char *cpuname=NULL;
781 int ret;
782 u32 lo, hi;
783
784 /* Check what we have on this motherboard */
785 switch (c->x86_model) {
786 case 6:
787 cpu_model = CPU_SAMUEL;
788 cpuname = "C3 'Samuel' [C5A]";
789 longhaul_version = TYPE_LONGHAUL_V1;
790 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio));
791 memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr));
792 break;
793
794 case 7:
795 switch (c->x86_mask) {
796 case 0:
797 longhaul_version = TYPE_LONGHAUL_V1;
798 cpu_model = CPU_SAMUEL2;
799 cpuname = "C3 'Samuel 2' [C5B]";
800 /* Note, this is not a typo, early Samuel2's had
801 * Samuel1 ratios. */
802 memcpy(clock_ratio, samuel1_clock_ratio,
803 sizeof(samuel1_clock_ratio));
804 memcpy(eblcr_table, samuel2_eblcr,
805 sizeof(samuel2_eblcr));
806 break;
807 case 1 ... 15:
808 longhaul_version = TYPE_LONGHAUL_V1;
809 if (c->x86_mask < 8) {
810 cpu_model = CPU_SAMUEL2;
811 cpuname = "C3 'Samuel 2' [C5B]";
812 } else {
813 cpu_model = CPU_EZRA;
814 cpuname = "C3 'Ezra' [C5C]";
815 }
816 memcpy(clock_ratio, ezra_clock_ratio,
817 sizeof(ezra_clock_ratio));
818 memcpy(eblcr_table, ezra_eblcr,
819 sizeof(ezra_eblcr));
820 break;
821 }
822 break;
823
824 case 8:
825 cpu_model = CPU_EZRA_T;
826 cpuname = "C3 'Ezra-T' [C5M]";
827 longhaul_version = TYPE_POWERSAVER;
828 numscales=32;
829 memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio));
830 memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr));
831 break;
832
833 case 9:
834 longhaul_version = TYPE_POWERSAVER;
835 numscales = 32;
836 memcpy(clock_ratio,
837 nehemiah_clock_ratio,
838 sizeof(nehemiah_clock_ratio));
839 memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
840 switch (c->x86_mask) {
841 case 0 ... 1:
842 cpu_model = CPU_NEHEMIAH;
843 cpuname = "C3 'Nehemiah A' [C5XLOE]";
844 break;
845 case 2 ... 4:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah B' [C5XLOH]";
848 break;
849 case 5 ... 15:
850 cpu_model = CPU_NEHEMIAH_C;
851 cpuname = "C3 'Nehemiah C' [C5P]";
852 break;
853 }
854 break;
855
856 default:
857 cpuname = "Unknown";
858 break;
859 }
860 /* Check Longhaul ver. 2 */
861 if (longhaul_version == TYPE_LONGHAUL_V2) {
862 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
863 if (lo == 0 && hi == 0)
864 /* Looks like MSR isn't present */
865 longhaul_version = TYPE_LONGHAUL_V1;
866 }
867
868 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
869 switch (longhaul_version) {
870 case TYPE_LONGHAUL_V1:
871 case TYPE_LONGHAUL_V2:
872 printk ("Longhaul v%d supported.\n", longhaul_version);
873 break;
874 case TYPE_POWERSAVER:
875 printk ("Powersaver supported.\n");
876 break;
877 };
878
879 /* Doesn't hurt */
880 longhaul_setup_southbridge();
881
882 /* Find ACPI data for processor */
883 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
884 ACPI_UINT32_MAX, &longhaul_walk_callback,
885 NULL, (void *)&pr);
886
887 /* Check ACPI support for C3 state */
888 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
889 cx = &pr->power.states[ACPI_STATE_C3];
890 if (cx->address > 0 && cx->latency <= 1000)
891 longhaul_flags |= USE_ACPI_C3;
892 }
893 /* Disable if it isn't working */
894 if (disable_acpi_c3)
895 longhaul_flags &= ~USE_ACPI_C3;
896 /* Check if northbridge is friendly */
897 if (enable_arbiter_disable())
898 longhaul_flags |= USE_NORTHBRIDGE;
899
900 /* Check ACPI support for bus master arbiter disable */
901 if (!(longhaul_flags & USE_ACPI_C3
902 || longhaul_flags & USE_NORTHBRIDGE)
903 && ((pr == NULL) || !(pr->flags.bm_control))) {
904 printk(KERN_ERR PFX
905 "No ACPI support. Unsupported northbridge.\n");
906 return -ENODEV;
907 }
908
909 if (longhaul_flags & USE_NORTHBRIDGE)
910 printk(KERN_INFO PFX "Using northbridge support.\n");
911 if (longhaul_flags & USE_ACPI_C3)
912 printk(KERN_INFO PFX "Using ACPI support.\n");
913
914 ret = longhaul_get_ranges();
915 if (ret != 0)
916 return ret;
917
918 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
919 longhaul_setup_voltagescaling();
920
921 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
922 policy->cpuinfo.transition_latency = 200000; /* nsec */
923 policy->cur = calc_speed(longhaul_get_cpu_mult());
924
925 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
926 if (ret)
927 return ret;
928
929 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
930
931 return 0;
932}
933
934static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
935{
936 cpufreq_frequency_table_put_attr(policy->cpu);
937 return 0;
938}
939
940static struct freq_attr* longhaul_attr[] = {
941 &cpufreq_freq_attr_scaling_available_freqs,
942 NULL,
943};
944
945static struct cpufreq_driver longhaul_driver = {
946 .verify = longhaul_verify,
947 .target = longhaul_target,
948 .get = longhaul_get,
949 .init = longhaul_cpu_init,
950 .exit = __devexit_p(longhaul_cpu_exit),
951 .name = "longhaul",
952 .owner = THIS_MODULE,
953 .attr = longhaul_attr,
954};
955
956
957static int __init longhaul_init(void)
958{
959 struct cpuinfo_x86 *c = cpu_data;
960
961 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
962 return -ENODEV;
963
964#ifdef CONFIG_SMP
965 if (num_online_cpus() > 1) {
966 printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
967 return -ENODEV;
968 }
969#endif
970#ifdef CONFIG_X86_IO_APIC
971 if (cpu_has_apic) {
972 printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n");
973 return -ENODEV;
974 }
975#endif
976 switch (c->x86_model) {
977 case 6 ... 9:
978 return cpufreq_register_driver(&longhaul_driver);
979 case 10:
980 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
981 default:
982 break;
983 }
984
985 return -ENODEV;
986}
987
988
989static void __exit longhaul_exit(void)
990{
991 int i;
992
993 for (i=0; i < numscales; i++) {
994 if (clock_ratio[i] == maxmult) {
995 longhaul_setstate(i);
996 break;
997 }
998 }
999
1000 cpufreq_unregister_driver(&longhaul_driver);
1001 kfree(longhaul_table);
1002}
1003
1004/* Even if the BIOS exports an ACPI C3 state, and it is used
1005 * successfully when the CPU is idle, this state doesn't
1006 * trigger a frequency transition in some cases. */
1007module_param (disable_acpi_c3, int, 0644);
1008MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1009/* Change CPU voltage along with frequency. Very useful for saving
1010 * power, but most VIA C3 processors don't support it. */
1011module_param (scale_voltage, int, 0644);
1012MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1013/* Force revision key to 0 for processors which don't
1014 * support voltage scaling but advertise themselves as if
1015 * they did. */
1016module_param(revid_errata, int, 0644);
1017MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1018
1019MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
1020MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1021MODULE_LICENSE ("GPL");
1022
1023late_initcall(longhaul_init);
1024module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
new file mode 100644
index 000000000000..4fcc320997df
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -0,0 +1,353 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12 unsigned Reseved:19, // 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28 Reserved:3, // 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr ones specify the ratio read from the CPU.
53 * The clock_ratio ones specify what to write to the CPU.
54 */
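/*
 * For example (illustrative, assuming a 133 MHz front-side bus): an entry of
 * 65 encodes a 6.5x multiplier (65 / 10 = 6, 65 % 10 = 5), which corresponds
 * to roughly 6.5 * 133 = 866 MHz; entries of -1 mark reserved encodings.
 */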
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __initdata samuel1_clock_ratio[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __initdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __initdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __initdata ezra_clock_ratio[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __initdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __initdata ezrat_clock_ratio[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __initdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __initdata nehemiah_clock_ratio[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __initdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
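/*
 * For example (illustrative): the vrm85_mV[] entry {1250, 8} means
 * 1250 / 1000 = 1.250 V at position 8 of the VRM 8.5 scale, while the
 * mV_vrm85[] array below maps each position back to its 5-bit VID code.
 */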
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __initdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __initdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __initdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __initdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
new file mode 100644
index 000000000000..b2689514295a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -0,0 +1,325 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17#include <asm/timex.h>
18
19#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
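/*
 * Worked example (illustrative values, not from the original source): with
 * longrun_low_freq = 300000 kHz and longrun_high_freq = 1000000 kHz, an MSR
 * boundary value of 50 corresponds to
 *   300000 + 50 * ((1000000 - 300000) / 100) = 650000 kHz,
 * i.e. the 50% performance level.
 */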
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if ( longrun_high_freq <= longrun_low_freq ) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if ( longrun_high_freq <= longrun_low_freq ) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return (eax * 1000);
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = cpu_data;
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq);
200
201 if (*low_freq > *high_freq)
202 *low_freq = *high_freq;
203 return 0;
204 }
205
206 /* set the upper border to the value determined during TSC init */
207 *high_freq = (cpu_khz / 1000);
208 *high_freq = *high_freq * 1000;
209 dprintk("high frequency is %u kHz\n", *high_freq);
210
211 /* get current borders */
212 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
213 save_lo = msr_lo & 0x0000007F;
214 save_hi = msr_hi & 0x0000007F;
215
216 /* if current perf_pctg is larger than 90%, we need to decrease the
217 * upper limit to make the calculation more accurate.
218 */
219 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
220 /* try decreasing in 10% steps, some processors react only
221 * on some barrier values */
222 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) {
223 /* set to 0 to try_hi perf_pctg */
224 msr_lo &= 0xFFFFFF80;
225 msr_hi &= 0xFFFFFF80;
226 msr_hi |= try_hi;
227 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
228
229 /* read out current core MHz and current perf_pctg */
230 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
231
232 /* restore values */
233 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
234 }
235 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
236
237 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
238 * equals
239 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
240 *
241 * high_freq * perf_pctg is stored temporarily into "ebx".
242 */
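 /*
  * Illustrative numbers (assumed): with cpu_khz ~= 1000000 (so
  * high_freq = 1000 MHz), ecx = 50 (%) and eax = 700 (MHz):
  *   ebx = (1000 * 50) / 100        = 500 MHz,
  *   edx = ((700 - 500) * 100) / 50 = 400 MHz,
  * giving *low_freq = 400000 kHz.
  */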
243 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
244
245 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
246 return -EIO;
247
248 edx = ((eax - ebx) * 100) / (100 - ecx);
249 *low_freq = edx * 1000; /* back to kHz */
250
251 dprintk("low frequency is %u kHz\n", *low_freq);
252
253 if (*low_freq > *high_freq)
254 *low_freq = *high_freq;
255
256 return 0;
257}
258
259
260static int __init longrun_cpu_init(struct cpufreq_policy *policy)
261{
262 int result = 0;
263
264 /* capability check */
265 if (policy->cpu != 0)
266 return -ENODEV;
267
268 /* detect low and high frequency */
269 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
270 if (result)
271 return result;
272
273 /* cpuinfo and default policy values */
274 policy->cpuinfo.min_freq = longrun_low_freq;
275 policy->cpuinfo.max_freq = longrun_high_freq;
276 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
277 longrun_get_policy(policy);
278
279 return 0;
280}
281
282
283static struct cpufreq_driver longrun_driver = {
284 .flags = CPUFREQ_CONST_LOOPS,
285 .verify = longrun_verify_policy,
286 .setpolicy = longrun_set_policy,
287 .get = longrun_get,
288 .init = longrun_cpu_init,
289 .name = "longrun",
290 .owner = THIS_MODULE,
291};
292
293
294/**
295 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
296 *
297 * Initializes the LongRun support.
298 */
299static int __init longrun_init(void)
300{
301 struct cpuinfo_x86 *c = cpu_data;
302
303 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
304 !cpu_has(c, X86_FEATURE_LONGRUN))
305 return -ENODEV;
306
307 return cpufreq_register_driver(&longrun_driver);
308}
309
310
311/**
312 * longrun_exit - unregisters LongRun support
313 */
314static void __exit longrun_exit(void)
315{
316 cpufreq_unregister_driver(&longrun_driver);
317}
318
319
320MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
321MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors.");
322MODULE_LICENSE ("GPL");
323
324module_init(longrun_init);
325module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
new file mode 100644
index 000000000000..4c76b511e194
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -0,0 +1,316 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timex.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg)
39
40/*
41 * Duty Cycle (3 bits); note that DC_DISABLE is not specified in the
42 * Intel docs, it is just used here to mean disable
43 */
44enum {
45 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
46 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
47};
48
49#define DC_ENTRIES 8
50
51
52static int has_N44_O17_errata[NR_CPUS];
53static unsigned int stock_freq;
54static struct cpufreq_driver p4clockmod_driver;
55static unsigned int cpufreq_p4_get(unsigned int cpu);
56
57static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
58{
59 u32 l, h;
60
61 if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV))
62 return -EINVAL;
63
64 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
65
66 if (l & 0x01)
67 dprintk("CPU#%d currently thermal throttled\n", cpu);
68
69 if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT))
70 newstate = DC_38PT;
71
72 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
73 if (newstate == DC_DISABLE) {
74 dprintk("CPU#%d disabling modulation\n", cpu);
75 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
76 } else {
77 dprintk("CPU#%d setting duty cycle to %d%%\n",
78 cpu, ((125 * newstate) / 10));
79 /* bits 63 - 5 : reserved
80 * bit 4 : enable/disable
81 * bits 3-1 : duty cycle
82 * bit 0 : reserved
83 */
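  /*
   * Illustrative example (not from the original source): for
   * newstate = DC_50PT (4) the effective duty cycle is 4/8 = 50%,
   * and the write below becomes l = (l & ~0xe) | (1 << 4) | (4 << 1),
   * i.e. modulation enabled with the duty-cycle field set to 100b.
   */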
84 l = (l & ~14);
85 l = l | (1<<4) | ((newstate & 0x7)<<1);
86 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
87 }
88
89 return 0;
90}
91
92
93static struct cpufreq_frequency_table p4clockmod_table[] = {
94 {DC_RESV, CPUFREQ_ENTRY_INVALID},
95 {DC_DFLT, 0},
96 {DC_25PT, 0},
97 {DC_38PT, 0},
98 {DC_50PT, 0},
99 {DC_64PT, 0},
100 {DC_75PT, 0},
101 {DC_88PT, 0},
102 {DC_DISABLE, 0},
103 {DC_RESV, CPUFREQ_TABLE_END},
104};
105
106
107static int cpufreq_p4_target(struct cpufreq_policy *policy,
108 unsigned int target_freq,
109 unsigned int relation)
110{
111 unsigned int newstate = DC_RESV;
112 struct cpufreq_freqs freqs;
113 int i;
114
115 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate))
116 return -EINVAL;
117
118 freqs.old = cpufreq_p4_get(policy->cpu);
119 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
120
121 if (freqs.new == freqs.old)
122 return 0;
123
124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) {
126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 }
129
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3
132 */
133 for_each_cpu_mask(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135
136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) {
138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 }
141
142 return 0;
143}
144
145
146static int cpufreq_p4_verify(struct cpufreq_policy *policy)
147{
148 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
149}
150
151
152static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
153{
154 if (c->x86 == 0x06) {
155 if (cpu_has(c, X86_FEATURE_EST))
156 printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. "
157 "The acpi-cpufreq module offers voltage scaling"
158 " in addition to frequency scaling. You should use "
159 "that instead of p4-clockmod, if possible.\n");
160 switch (c->x86_model) {
161 case 0x0E: /* Core */
162 case 0x0F: /* Core Duo */
163 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
164 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
165 case 0x0D: /* Pentium M (Dothan) */
166 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
167 /* fall through */
168 case 0x09: /* Pentium M (Banias) */
169 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
170 }
171 }
172
173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
175 return 0;
176 }
177
178 /* on P-4s, the TSC runs with constant frequency independently of
179 * whether throttling is active or not. */
180 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
181
182 if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) {
183 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
184 "The speedstep-ich or acpi cpufreq modules offer "
185 "voltage scaling in addition to frequency scaling. "
186 "You should use either one instead of p4-clockmod, "
187 "if possible.\n");
188 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M);
189 }
190
191 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D);
192}
193
194
195
196static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
197{
198 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
199 int cpuid = 0;
200 unsigned int i;
201
202#ifdef CONFIG_SMP
203 policy->cpus = cpu_sibling_map[policy->cpu];
204#endif
205
206 /* Errata workaround */
207 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
208 switch (cpuid) {
209 case 0x0f07:
210 case 0x0f0a:
211 case 0x0f11:
212 case 0x0f12:
213 has_N44_O17_errata[policy->cpu] = 1;
214 dprintk("has errata -- disabling low frequencies\n");
215 }
216
217 /* get max frequency */
218 stock_freq = cpufreq_p4_get_frequency(c);
219 if (!stock_freq)
220 return -EINVAL;
221
222 /* table init */
223 for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
224 if ((i<2) && (has_N44_O17_errata[policy->cpu]))
225 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
226 else
227 p4clockmod_table[i].frequency = (stock_freq * i)/8;
228 }
229 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
230
231 /* cpuinfo and default policy values */
232 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
233 policy->cpuinfo.transition_latency = 1000000; /* assumed */
234 policy->cur = stock_freq;
235
236 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
237}
238
239
240static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
241{
242 cpufreq_frequency_table_put_attr(policy->cpu);
243 return 0;
244}
245
246static unsigned int cpufreq_p4_get(unsigned int cpu)
247{
248 u32 l, h;
249
250 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
251
252 if (l & 0x10) {
253 l = l >> 1;
254 l &= 0x7;
255 } else
256 l = DC_DISABLE;
257
258 if (l != DC_DISABLE)
259 return (stock_freq * l / 8);
260
261 return stock_freq;
262}
263
264static struct freq_attr* p4clockmod_attr[] = {
265 &cpufreq_freq_attr_scaling_available_freqs,
266 NULL,
267};
268
269static struct cpufreq_driver p4clockmod_driver = {
270 .verify = cpufreq_p4_verify,
271 .target = cpufreq_p4_target,
272 .init = cpufreq_p4_cpu_init,
273 .exit = cpufreq_p4_cpu_exit,
274 .get = cpufreq_p4_get,
275 .name = "p4-clockmod",
276 .owner = THIS_MODULE,
277 .attr = p4clockmod_attr,
278};
279
280
281static int __init cpufreq_p4_init(void)
282{
283 struct cpuinfo_x86 *c = cpu_data;
284 int ret;
285
286 /*
287 * THERM_CONTROL is architectural for IA32 now, so
288 * we can rely on the capability checks
289 */
290 if (c->x86_vendor != X86_VENDOR_INTEL)
291 return -ENODEV;
292
293 if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) ||
294 !test_bit(X86_FEATURE_ACC, c->x86_capability))
295 return -ENODEV;
296
297 ret = cpufreq_register_driver(&p4clockmod_driver);
298 if (!ret)
299 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n");
300
301 return (ret);
302}
303
304
305static void __exit cpufreq_p4_exit(void)
306{
307 cpufreq_unregister_driver(&p4clockmod_driver);
308}
309
310
311MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>");
312MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
313MODULE_LICENSE ("GPL");
314
315late_initcall(cpufreq_p4_init);
316module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
new file mode 100644
index 000000000000..f89524051e4a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -0,0 +1,256 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/cpufreq.h>
14#include <linux/ioport.h>
15#include <linux/slab.h>
16
17#include <asm/msr.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20
21
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
23 as it is unused */
24
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
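/*
 * Illustrative example (assumed values): a 550 MHz K6-2+ running at its
 * maximum 5.5x multiplier gives busfreq = 550000 / 55 = 10000 (a 100 MHz FSB
 * in 10 kHz units); the 4.5x entry above then maps to 10000 * 45 = 450000 kHz.
 */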
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is the Front-Side Bus frequency multiplied by this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state (unsigned int best_i)
71{
72 unsigned long outvalue=0, invalue=0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR "cpufreq: invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_target - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
124 *
125 * sets a new CPUFreq policy
126 */
127static int powernow_k6_target (struct cpufreq_policy *policy,
128 unsigned int target_freq,
129 unsigned int relation)
130{
131 unsigned int newstate = 0;
132
133 if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate))
134 return -EINVAL;
135
136 powernow_k6_set_state(newstate);
137
138 return 0;
139}
140
141
142static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
143{
144 unsigned int i;
145 int result;
146
147 if (policy->cpu != 0)
148 return -ENODEV;
149
150 /* get frequencies */
151 max_multiplier = powernow_k6_get_cpu_multiplier();
152 busfreq = cpu_khz / max_multiplier;
153
154 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else
159 clock_ratio[i].frequency = busfreq * clock_ratio[i].index;
160 }
161
162 /* cpuinfo and default policy values */
163 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
164 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
165 policy->cur = busfreq * max_multiplier;
166
167 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
168 if (result)
169 return (result);
170
171 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
172
173 return 0;
174}
175
176
177static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
178{
179 unsigned int i;
180 for (i=0; i<8; i++) {
181 if (i==max_multiplier)
182 powernow_k6_set_state(i);
183 }
184 cpufreq_frequency_table_put_attr(policy->cpu);
185 return 0;
186}
187
188static unsigned int powernow_k6_get(unsigned int cpu)
189{
190 return busfreq * powernow_k6_get_cpu_multiplier();
191}
192
193static struct freq_attr* powernow_k6_attr[] = {
194 &cpufreq_freq_attr_scaling_available_freqs,
195 NULL,
196};
197
198static struct cpufreq_driver powernow_k6_driver = {
199 .verify = powernow_k6_verify,
200 .target = powernow_k6_target,
201 .init = powernow_k6_cpu_init,
202 .exit = powernow_k6_cpu_exit,
203 .get = powernow_k6_get,
204 .name = "powernow-k6",
205 .owner = THIS_MODULE,
206 .attr = powernow_k6_attr,
207};
208
209
210/**
211 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
212 *
213 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
214 * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero
215 * on success.
216 */
217static int __init powernow_k6_init(void)
218{
219 struct cpuinfo_x86 *c = cpu_data;
220
221 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
222 ((c->x86_model != 12) && (c->x86_model != 13)))
223 return -ENODEV;
224
225 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
226 printk("cpufreq: PowerNow IOPORT region already used.\n");
227 return -EIO;
228 }
229
230 if (cpufreq_register_driver(&powernow_k6_driver)) {
231 release_region (POWERNOW_IOPORT, 16);
232 return -EINVAL;
233 }
234
235 return 0;
236}
237
238
239/**
240 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
241 *
242 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
243 */
244static void __exit powernow_k6_exit(void)
245{
246 cpufreq_unregister_driver(&powernow_k6_driver);
247 release_region (POWERNOW_IOPORT, 16);
248}
249
250
251MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
252MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
253MODULE_LICENSE ("GPL");
254
255module_init(powernow_k6_init);
256module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
new file mode 100644
index 000000000000..ca3e1d341889
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -0,0 +1,703 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt.
10 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
11 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect.
12 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
13 */
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/string.h>
22#include <linux/dmi.h>
23
24#include <asm/msr.h>
25#include <asm/timer.h>
26#include <asm/timex.h>
27#include <asm/io.h>
28#include <asm/system.h>
29
30#ifdef CONFIG_X86_POWERNOW_K7_ACPI
31#include <linux/acpi.h>
32#include <acpi/processor.h>
33#endif
34
35#include "powernow-k7.h"
36
37#define PFX "powernow: "
38
39
40struct psb_s {
41 u8 signature[10];
42 u8 tableversion;
43 u8 flags;
44 u16 settlingtime;
45 u8 reserved1;
46 u8 numpst;
47};
48
49struct pst_s {
50 u32 cpuid;
51 u8 fsbspeed;
52 u8 maxfid;
53 u8 startvid;
54 u8 numpstates;
55};
56
57#ifdef CONFIG_X86_POWERNOW_K7_ACPI
58union powernow_acpi_control_t {
59 struct {
60 unsigned long fid:5,
61 vid:5,
62 sgtc:20,
63 res1:2;
64 } bits;
65 unsigned long val;
66};
67#endif
68
69#ifdef CONFIG_CPU_FREQ_DEBUG
70/* divide by 1000 to get VCore voltage in V. */
71static const int mobile_vid_table[32] = {
72 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
73 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
74 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
75 1075, 1050, 1025, 1000, 975, 950, 925, 0,
76};
77#endif
78
79/* divide by 10 to get the frequency multiplier. */
80static const int fid_codes[32] = {
81 110, 115, 120, 125, 50, 55, 60, 65,
82 70, 75, 80, 85, 90, 95, 100, 105,
83 30, 190, 40, 200, 130, 135, 140, 210,
84 150, 225, 160, 165, 170, 180, -1, -1,
85};
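/*
 * A worked example of how this table is used (illustrative only, assuming
 * a 100 MHz front-side bus, i.e. fsb == 100000 kHz): a FID of 0 selects
 * fid_codes[0] == 110, an 11.0x multiplier, so get_ranges() below computes
 * fsb * fid_codes[0] / 10 == 100000 * 110 / 10 == 1100000 kHz (1100 MHz).
 * The -1 entries appear to mark unused FID codes.
 */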
86
87/* This parameter is used to force ACPI instead of the legacy method for
88 * configuration purposes.
89 */
90
91static int acpi_force;
92
93static struct cpufreq_frequency_table *powernow_table;
94
95static unsigned int can_scale_bus;
96static unsigned int can_scale_vid;
97static unsigned int minimum_speed=-1;
98static unsigned int maximum_speed;
99static unsigned int number_scales;
100static unsigned int fsb;
101static unsigned int latency;
102static char have_a0;
103
104#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg)
105
106static int check_fsb(unsigned int fsbspeed)
107{
108 int delta;
109 unsigned int f = fsb / 1000;
110
111 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
112 return (delta < 5);
113}
114
115static int check_powernow(void)
116{
117 struct cpuinfo_x86 *c = cpu_data;
118 unsigned int maxei, eax, ebx, ecx, edx;
119
120 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) {
121#ifdef MODULE
122 printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n");
123#endif
124 return 0;
125 }
126
127 /* Get maximum capabilities */
128 maxei = cpuid_eax (0x80000000);
129 if (maxei < 0x80000007) { /* Any powernow info ? */
130#ifdef MODULE
131 printk (KERN_INFO PFX "No powernow capabilities detected\n");
132#endif
133 return 0;
134 }
135
136 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
137 printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n");
138 have_a0 = 1;
139 }
140
141 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
142
143 /* Check we can actually do something before we say anything.*/
144 if (!(edx & (1 << 1 | 1 << 2)))
145 return 0;
146
147 printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
148
149 if (edx & 1 << 1) {
150 printk ("frequency");
151 can_scale_bus=1;
152 }
153
154 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
155 printk (" and ");
156
157 if (edx & 1 << 2) {
158 printk ("voltage");
159 can_scale_vid=1;
160 }
161
162 printk (".\n");
163 return 1;
164}
165
166
167static int get_ranges (unsigned char *pst)
168{
169 unsigned int j;
170 unsigned int speed;
171 u8 fid, vid;
172
173 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL);
174 if (!powernow_table)
175 return -ENOMEM;
176
177 for (j=0 ; j < number_scales; j++) {
178 fid = *pst++;
179
180 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
181 powernow_table[j].index = fid; /* lower 8 bits */
182
183 speed = powernow_table[j].frequency;
184
185 if ((fid_codes[fid] % 10)==5) {
186#ifdef CONFIG_X86_POWERNOW_K7_ACPI
187 if (have_a0 == 1)
188 powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID;
189#endif
190 }
191
192 if (speed < minimum_speed)
193 minimum_speed = speed;
194 if (speed > maximum_speed)
195 maximum_speed = speed;
196
197 vid = *pst++;
198 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
199
200 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
201 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
202 fid_codes[fid] % 10, speed/1000, vid,
203 mobile_vid_table[vid]/1000,
204 mobile_vid_table[vid]%1000);
205 }
206 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
207 powernow_table[number_scales].index = 0;
208
209 return 0;
210}
211
212
213static void change_FID(int fid)
214{
215 union msr_fidvidctl fidvidctl;
216
217 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
218 if (fidvidctl.bits.FID != fid) {
219 fidvidctl.bits.SGTC = latency;
220 fidvidctl.bits.FID = fid;
221 fidvidctl.bits.VIDC = 0;
222 fidvidctl.bits.FIDC = 1;
223 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
224 }
225}
226
227
228static void change_VID(int vid)
229{
230 union msr_fidvidctl fidvidctl;
231
232 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
233 if (fidvidctl.bits.VID != vid) {
234 fidvidctl.bits.SGTC = latency;
235 fidvidctl.bits.VID = vid;
236 fidvidctl.bits.FIDC = 0;
237 fidvidctl.bits.VIDC = 1;
238 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val);
239 }
240}
241
242
243static void change_speed (unsigned int index)
244{
245 u8 fid, vid;
246 struct cpufreq_freqs freqs;
247 union msr_fidvidstatus fidvidstatus;
248 int cfid;
249
250 /* fid are the lower 8 bits of the index we stored into
251 * the cpufreq frequency table in powernow_decode_bios,
252 * vid are the upper 8 bits.
253 */
254
255 fid = powernow_table[index].index & 0xFF;
256 vid = (powernow_table[index].index & 0xFF00) >> 8;
257
258 freqs.cpu = 0;
259
260 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
261 cfid = fidvidstatus.bits.CFID;
262 freqs.old = fsb * fid_codes[cfid] / 10;
263
264 freqs.new = powernow_table[index].frequency;
265
266 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
267
268 /* Now do the magic poking into the MSRs. */
269
270 if (have_a0 == 1) /* A0 errata 5 */
271 local_irq_disable();
272
273 if (freqs.old > freqs.new) {
274 /* Going down, so change FID first */
275 change_FID(fid);
276 change_VID(vid);
277 } else {
278 /* Going up, so change VID first */
279 change_VID(vid);
280 change_FID(fid);
281 }
282
283
284 if (have_a0 == 1)
285 local_irq_enable();
286
287 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
288}
289
290
291#ifdef CONFIG_X86_POWERNOW_K7_ACPI
292
293static struct acpi_processor_performance *acpi_processor_perf;
294
295static int powernow_acpi_init(void)
296{
297 int i;
298 int retval = 0;
299 union powernow_acpi_control_t pc;
300
301 if (acpi_processor_perf != NULL && powernow_table != NULL) {
302 retval = -EINVAL;
303 goto err0;
304 }
305
306 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
307 GFP_KERNEL);
308 if (!acpi_processor_perf) {
309 retval = -ENOMEM;
310 goto err0;
311 }
312
313 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
314 retval = -EIO;
315 goto err1;
316 }
317
318 if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
319 retval = -ENODEV;
320 goto err2;
321 }
322
323 if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) {
324 retval = -ENODEV;
325 goto err2;
326 }
327
328 number_scales = acpi_processor_perf->state_count;
329
330 if (number_scales < 2) {
331 retval = -ENODEV;
332 goto err2;
333 }
334
335 powernow_table = kzalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL);
336 if (!powernow_table) {
337 retval = -ENOMEM;
338 goto err2;
339 }
340
341 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
342 for (i = 0; i < number_scales; i++) {
343 u8 fid, vid;
344 struct acpi_processor_px *state =
345 &acpi_processor_perf->states[i];
346 unsigned int speed, speed_mhz;
347
348 pc.val = (unsigned long) state->control;
349 dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
350 i,
351 (u32) state->core_frequency,
352 (u32) state->power,
353 (u32) state->transition_latency,
354 (u32) state->control,
355 pc.bits.sgtc);
356
357 vid = pc.bits.vid;
358 fid = pc.bits.fid;
359
360 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
361 powernow_table[i].index = fid; /* lower 8 bits */
362 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
363
364 speed = powernow_table[i].frequency;
365 speed_mhz = speed / 1000;
366
367 /* processor_perflib will multiply the MHz value by 1000 to
368 * get a KHz value (e.g. 1266000). However, powernow-k7 works
369 * with true KHz values (e.g. 1266768). To ensure that all
370 * powernow frequencies are available, we must ensure that
371 * ACPI doesn't restrict them, so we round up the MHz value
372 * to ensure that perflib's computed KHz value is greater than
373 * or equal to powernow's KHz value.
374 */
375 if (speed % 1000 > 0)
376 speed_mhz++;
377
378 if ((fid_codes[fid] % 10)==5) {
379 if (have_a0 == 1)
380 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
381 }
382
383 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) "
384 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
385 fid_codes[fid] % 10, speed_mhz, vid,
386 mobile_vid_table[vid]/1000,
387 mobile_vid_table[vid]%1000);
388
389 if (state->core_frequency != speed_mhz) {
390 state->core_frequency = speed_mhz;
391 dprintk(" Corrected ACPI frequency to %d\n",
392 speed_mhz);
393 }
394
395 if (latency < pc.bits.sgtc)
396 latency = pc.bits.sgtc;
397
398 if (speed < minimum_speed)
399 minimum_speed = speed;
400 if (speed > maximum_speed)
401 maximum_speed = speed;
402 }
403
404 powernow_table[i].frequency = CPUFREQ_TABLE_END;
405 powernow_table[i].index = 0;
406
407 /* notify BIOS that we exist */
408 acpi_processor_notify_smm(THIS_MODULE);
409
410 return 0;
411
412err2:
413 acpi_processor_unregister_performance(acpi_processor_perf, 0);
414err1:
415 kfree(acpi_processor_perf);
416err0:
417 printk(KERN_WARNING PFX "ACPI perflib cannot be used on this platform\n");
418 acpi_processor_perf = NULL;
419 return retval;
420}
421#else
422static int powernow_acpi_init(void)
423{
424 printk(KERN_INFO PFX "no support for ACPI processor found."
425 " Please recompile your kernel with ACPI processor\n");
426 return -EINVAL;
427}
428#endif
429
430static int powernow_decode_bios (int maxfid, int startvid)
431{
432 struct psb_s *psb;
433 struct pst_s *pst;
434 unsigned int i, j;
435 unsigned char *p;
436 unsigned int etuple;
437 unsigned int ret;
438
439 etuple = cpuid_eax(0x80000001);
440
441 for (i=0xC0000; i < 0xffff0 ; i+=16) {
442
443 p = phys_to_virt(i);
444
445 if (memcmp(p, "AMDK7PNOW!", 10) == 0){
446 dprintk ("Found PSB header at %p\n", p);
447 psb = (struct psb_s *) p;
448 dprintk ("Table version: 0x%x\n", psb->tableversion);
449 if (psb->tableversion != 0x12) {
450 printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n");
451 return -ENODEV;
452 }
453
454 dprintk ("Flags: 0x%x\n", psb->flags);
455 if ((psb->flags & 1)==0) {
456 dprintk ("Mobile voltage regulator\n");
457 } else {
458 dprintk ("Desktop voltage regulator\n");
459 }
460
461 latency = psb->settlingtime;
462 if (latency < 100) {
463 printk (KERN_INFO PFX "BIOS set settling time to %d microseconds."
464 "Should be at least 100. Correcting.\n", latency);
465 latency = 100;
466 }
467 dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime);
468 dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst);
469
470 p += sizeof (struct psb_s);
471
472 pst = (struct pst_s *) p;
473
474 for (j=0; j<psb->numpst; j++) {
475 pst = (struct pst_s *) p;
476 number_scales = pst->numpstates;
477
478 if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
479 (maxfid==pst->maxfid) && (startvid==pst->startvid))
480 {
481 dprintk ("PST:%d (@%p)\n", j, pst);
482 dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
483 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
484
485 ret = get_ranges ((char *) pst + sizeof (struct pst_s));
486 return ret;
487 } else {
488 unsigned int k;
489 p = (char *) pst + sizeof (struct pst_s);
490 for (k=0; k<number_scales; k++)
491 p+=2;
492 }
493 }
494 printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple);
495 printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n");
496
497 return -EINVAL;
498 }
499 p++;
500 }
501
502 return -ENODEV;
503}
504
505
506static int powernow_target (struct cpufreq_policy *policy,
507 unsigned int target_freq,
508 unsigned int relation)
509{
510 unsigned int newstate;
511
512 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate))
513 return -EINVAL;
514
515 change_speed(newstate);
516
517 return 0;
518}
519
520
521static int powernow_verify (struct cpufreq_policy *policy)
522{
523 return cpufreq_frequency_table_verify(policy, powernow_table);
524}
525
526/*
527 * We use the fact that the bus frequency is somehow
528 * a multiple of 100000/3 kHz, and compute sgtc according
529 * to this multiple.
530 * That way we better match how AMD intends this to work,
531 * and get the same kind of behaviour already tested under
532 * the "well-known" other OS.
533 */
534static int __init fixup_sgtc(void)
535{
536 unsigned int sgtc;
537 unsigned int m;
538
539 m = fsb / 3333;
540 if ((m % 10) >= 5)
541 m += 5;
542
543 m /= 10;
544
545 sgtc = 100 * m * latency;
546 sgtc = sgtc / 3;
547 if (sgtc > 0xfffff) {
548 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
549 sgtc = 0xfffff;
550 }
551 return sgtc;
552}
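/*
 * A worked instance of the computation above (illustrative only), assuming
 * fsb == 100000 kHz (100 MHz bus) and the minimum settling time latency == 100:
 *
 *   m    = 100000 / 3333 = 30;   (30 % 10) < 5, so no rounding up
 *   m   /= 10             -> 3
 *   sgtc = 100 * 3 * 100  = 30000
 *   sgtc = 30000 / 3      = 10000   (well below the 0xfffff clamp)
 */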
553
554static unsigned int powernow_get(unsigned int cpu)
555{
556 union msr_fidvidstatus fidvidstatus;
557 unsigned int cfid;
558
559 if (cpu)
560 return 0;
561 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
562 cfid = fidvidstatus.bits.CFID;
563
564 return (fsb * fid_codes[cfid] / 10);
565}
566
567
568static int __init acer_cpufreq_pst(struct dmi_system_id *d)
569{
570 printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident);
571 printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n");
572 printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n");
573 return 0;
574}
575
576/*
577 * Some Athlon laptops have really broken PST tables.
578 * A BIOS update is all that can save them.
579 * Mention this, and disable cpufreq.
580 */
581static struct dmi_system_id __initdata powernow_dmi_table[] = {
582 {
583 .callback = acer_cpufreq_pst,
584 .ident = "Acer Aspire",
585 .matches = {
586 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
587 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
588 },
589 },
590 { }
591};
592
593static int __init powernow_cpu_init (struct cpufreq_policy *policy)
594{
595 union msr_fidvidstatus fidvidstatus;
596 int result;
597
598 if (policy->cpu != 0)
599 return -ENODEV;
600
601 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
602
603 recalibrate_cpu_khz();
604
605 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
606 if (!fsb) {
607 printk(KERN_WARNING PFX "can not determine bus frequency\n");
608 return -EINVAL;
609 }
610 dprintk("FSB: %3dMHz\n", fsb/1000);
611
612 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
613 printk (KERN_INFO PFX "PSB/PST known to be broken. Trying ACPI instead\n");
614 result = powernow_acpi_init();
615 } else {
616 result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID);
617 if (result) {
618 printk (KERN_INFO PFX "Trying ACPI perflib\n");
619 maximum_speed = 0;
620 minimum_speed = -1;
621 latency = 0;
622 result = powernow_acpi_init();
623 if (result) {
624 printk (KERN_INFO PFX "ACPI and legacy methods failed\n");
625 printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n");
626 }
627 } else {
628 /* SGTC uses the bus clock as its timer */
629 latency = fixup_sgtc();
630 printk(KERN_INFO PFX "SGTC: %d\n", latency);
631 }
632 }
633
634 if (result)
635 return result;
636
637 printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
638 minimum_speed/1000, maximum_speed/1000);
639
640 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
641
642 policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency);
643
644 policy->cur = powernow_get(0);
645
646 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
647
648 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
649}
650
651static int powernow_cpu_exit (struct cpufreq_policy *policy) {
652 cpufreq_frequency_table_put_attr(policy->cpu);
653
654#ifdef CONFIG_X86_POWERNOW_K7_ACPI
655 if (acpi_processor_perf) {
656 acpi_processor_unregister_performance(acpi_processor_perf, 0);
657 kfree(acpi_processor_perf);
658 }
659#endif
660
661 kfree(powernow_table);
662 return 0;
663}
664
665static struct freq_attr* powernow_table_attr[] = {
666 &cpufreq_freq_attr_scaling_available_freqs,
667 NULL,
668};
669
670static struct cpufreq_driver powernow_driver = {
671 .verify = powernow_verify,
672 .target = powernow_target,
673 .get = powernow_get,
674 .init = powernow_cpu_init,
675 .exit = powernow_cpu_exit,
676 .name = "powernow-k7",
677 .owner = THIS_MODULE,
678 .attr = powernow_table_attr,
679};
680
681static int __init powernow_init (void)
682{
683 if (check_powernow()==0)
684 return -ENODEV;
685 return cpufreq_register_driver(&powernow_driver);
686}
687
688
689static void __exit powernow_exit (void)
690{
691 cpufreq_unregister_driver(&powernow_driver);
692}
693
694module_param(acpi_force, int, 0444);
695MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
696
697MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
698MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
699MODULE_LICENSE ("GPL");
700
701late_initcall(powernow_init);
702module_exit(powernow_exit);
703
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
new file mode 100644
index 000000000000..f8a63b3664e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -0,0 +1,44 @@
1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * AMD-specific information
8 *
9 */
10
11union msr_fidvidctl {
12 struct {
13 unsigned FID:5, // 4:0
14 reserved1:3, // 7:5
15 VID:5, // 12:8
16 reserved2:3, // 15:13
17 FIDC:1, // 16
18 VIDC:1, // 17
19 reserved3:2, // 19:18
20 FIDCHGRATIO:1, // 20
21 reserved4:11, // 31:21
22 SGTC:20, // 51:32
23 reserved5:12; // 63:52
24 } bits;
25 unsigned long long val;
26};
27
28union msr_fidvidstatus {
29 struct {
30 unsigned CFID:5, // 4:0
31 reserved1:3, // 7:5
32 SFID:5, // 12:8
33 reserved2:3, // 15:13
34 MFID:5, // 20:16
35 reserved3:11, // 31:21
36 CVID:5, // 36:32
37 reserved4:3, // 39:37
38 SVID:5, // 44:40
39 reserved5:3, // 47:45
40 MVID:5, // 52:48
41 reserved6:11; // 63:53
42 } bits;
43 unsigned long long val;
44};
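
/*
 * A minimal usage sketch (illustrative, not part of the driver itself):
 * decoding a raw 64-bit value read from MSR_K7_FID_VID_STATUS through the
 * union above, the same way powernow-k7.c does after rdmsrl():
 *
 *   union msr_fidvidstatus st;
 *   st.val = raw_msr_value;      // raw_msr_value obtained via rdmsrl()
 *   cfid = st.bits.CFID;         // current FID, bits 4:0
 *   cvid = st.bits.CVID;         // current VID, bits 36:32
 */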
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
new file mode 100644
index 000000000000..34ed53a06730
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -0,0 +1,1363 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36
37#include <asm/msr.h>
38#include <asm/io.h>
39#include <asm/delay.h>
40
41#ifdef CONFIG_X86_POWERNOW_K8_ACPI
42#include <linux/acpi.h>
43#include <linux/mutex.h>
44#include <acpi/processor.h>
45#endif
46
47#define PFX "powernow-k8: "
48#define BFX PFX "BIOS error: "
49#define VERSION "version 2.00.00"
50#include "powernow-k8.h"
51
52/* serialize freq changes */
53static DEFINE_MUTEX(fidvid_mutex);
54
55static struct powernow_k8_data *powernow_data[NR_CPUS];
56
57static int cpu_family = CPU_OPTERON;
58
59#ifndef CONFIG_SMP
60static cpumask_t cpu_core_map[1];
61#endif
62
63/* Return a frequency in MHz, given an input fid */
64static u32 find_freq_from_fid(u32 fid)
65{
66 return 800 + (fid * 100);
67}
68
69
70/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid)
72{
73 return 1000 * find_freq_from_fid(fid);
74}
75
76/* Return a frequency in MHz, given an input fid and did */
77static u32 find_freq_from_fiddid(u32 fid, u32 did)
78{
79 return 100 * (fid + 0x10) >> did;
80}
81
82static u32 find_khz_freq_from_fiddid(u32 fid, u32 did)
83{
84 return 1000 * find_freq_from_fiddid(fid, did);
85}
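/*
 * Worked instances of the two conversions above (illustrative only):
 *   fid only:      fid = 2            -> 800 + 2 * 100          = 1000 MHz
 *   fid and did:   fid = 2, did = 0   -> 100 * (2 + 0x10) >> 0  = 1800 MHz
 *                  fid = 2, did = 1   -> 100 * (2 + 0x10) >> 1  =  900 MHz
 */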
86
87static u32 find_fid_from_pstate(u32 pstate)
88{
89 u32 hi, lo;
90 rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
91 return lo & HW_PSTATE_FID_MASK;
92}
93
94static u32 find_did_from_pstate(u32 pstate)
95{
96 u32 hi, lo;
97 rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
98 return (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
99}
100
101/* Return the vco fid for an input fid
102 *
103 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
104 * only from corresponding high fids. This returns "high" fid corresponding to
105 * "low" one.
106 */
107static u32 convert_fid_to_vco_fid(u32 fid)
108{
109 if (fid < HI_FID_TABLE_BOTTOM)
110 return 8 + (2 * fid);
111 else
112 return fid;
113}
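/*
 * Example (illustrative only, assuming 0x2 is below HI_FID_TABLE_BOTTOM):
 * a "low" fid of 0x2 maps to VCO fid 8 + 2 * 2 = 0xc, while any fid at or
 * above HI_FID_TABLE_BOTTOM is already a VCO fid and is returned unchanged.
 */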
114
115/*
116 * Return 1 if the pending bit is set. Unless we just instructed the processor
117 * to transition to a new state, seeing this bit set is really bad news.
118 */
119static int pending_bit_stuck(void)
120{
121 u32 lo, hi;
122
123 if (cpu_family == CPU_HW_PSTATE)
124 return 0;
125
126 rdmsr(MSR_FIDVID_STATUS, lo, hi);
127 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
128}
129
130/*
131 * Update the global current fid / vid values from the status msr.
132 * Returns 1 on error.
133 */
134static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
135{
136 u32 lo, hi;
137 u32 i = 0;
138
139 if (cpu_family == CPU_HW_PSTATE) {
140 rdmsr(MSR_PSTATE_STATUS, lo, hi);
141 i = lo & HW_PSTATE_MASK;
142 rdmsr(MSR_PSTATE_DEF_BASE + i, lo, hi);
143 data->currfid = lo & HW_PSTATE_FID_MASK;
144 data->currdid = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
145 return 0;
146 }
147 do {
148 if (i++ > 10000) {
149 dprintk("detected change pending stuck\n");
150 return 1;
151 }
152 rdmsr(MSR_FIDVID_STATUS, lo, hi);
153 } while (lo & MSR_S_LO_CHANGE_PENDING);
154
155 data->currvid = hi & MSR_S_HI_CURRENT_VID;
156 data->currfid = lo & MSR_S_LO_CURRENT_FID;
157
158 return 0;
159}
160
161/* the isochronous relief time */
162static void count_off_irt(struct powernow_k8_data *data)
163{
164 udelay((1 << data->irt) * 10);
165 return;
166}
167
168/* the voltage stabilization time */
169static void count_off_vst(struct powernow_k8_data *data)
170{
171 udelay(data->vstable * VST_UNITS_20US);
172 return;
173}
174
175/* need to init the control msr to a safe value (for each cpu) */
176static void fidvid_msr_init(void)
177{
178 u32 lo, hi;
179 u8 fid, vid;
180
181 rdmsr(MSR_FIDVID_STATUS, lo, hi);
182 vid = hi & MSR_S_HI_CURRENT_VID;
183 fid = lo & MSR_S_LO_CURRENT_FID;
184 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
185 hi = MSR_C_HI_STP_GNT_BENIGN;
186 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
187 wrmsr(MSR_FIDVID_CTL, lo, hi);
188}
189
190
191/* write the new fid value along with the other control fields to the msr */
192static int write_new_fid(struct powernow_k8_data *data, u32 fid)
193{
194 u32 lo;
195 u32 savevid = data->currvid;
196 u32 i = 0;
197
198 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
199 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
200 return 1;
201 }
202
203 lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
204
205 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
206 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
207
208 do {
209 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
210 if (i++ > 100) {
211 printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n");
212 return 1;
213 }
214 } while (query_current_values_with_pending_wait(data));
215
216 count_off_irt(data);
217
218 if (savevid != data->currvid) {
219 printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n",
220 savevid, data->currvid);
221 return 1;
222 }
223
224 if (fid != data->currfid) {
225 printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
226 data->currfid);
227 return 1;
228 }
229
230 return 0;
231}
232
233/* Write a new vid to the hardware */
234static int write_new_vid(struct powernow_k8_data *data, u32 vid)
235{
236 u32 lo;
237 u32 savefid = data->currfid;
238 int i = 0;
239
240 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
241 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
242 return 1;
243 }
244
245 lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n");
254 return 1;
255 }
256 } while (query_current_values_with_pending_wait(data));
257
258 if (savefid != data->currfid) {
259 printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n",
260 savefid, data->currfid);
261 return 1;
262 }
263
264 if (vid != data->currvid) {
265 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid,
266 data->currvid);
267 return 1;
268 }
269
270 return 0;
271}
272
273/*
274 * Reduce the vid towards reqvid, stepping down by at most 'step' at a time.
275 * Decreasing vid codes represent increasing voltages:
276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V (25mV per code), vid of VID_OFF is off.
277 */
278static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step)
279{
280 if ((data->currvid - reqvid) > step)
281 reqvid = data->currvid - step;
282
283 if (write_new_vid(data, reqvid))
284 return 1;
285
286 count_off_vst(data);
287
288 return 0;
289}
290
291/* Change hardware pstate by single MSR write */
292static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
293{
294 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
295 data->currfid = find_fid_from_pstate(pstate);
296 return 0;
297}
298
299/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
300static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid)
301{
302 if (core_voltage_pre_transition(data, reqvid))
303 return 1;
304
305 if (core_frequency_transition(data, reqfid))
306 return 1;
307
308 if (core_voltage_post_transition(data, reqvid))
309 return 1;
310
311 if (query_current_values_with_pending_wait(data))
312 return 1;
313
314 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
315 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n",
316 smp_processor_id(),
317 reqfid, reqvid, data->currfid, data->currvid);
318 return 1;
319 }
320
321 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
322 smp_processor_id(), data->currfid, data->currvid);
323
324 return 0;
325}
326
327/* Phase 1 - core voltage transition ... setup voltage */
328static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid)
329{
330 u32 rvosteps = data->rvo;
331 u32 savefid = data->currfid;
332 u32 maxvid, lo;
333
334 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n",
335 smp_processor_id(),
336 data->currfid, data->currvid, reqvid, data->rvo);
337
338 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
339 maxvid = 0x1f & (maxvid >> 16);
340 dprintk("ph1 maxvid=0x%x\n", maxvid);
341 if (reqvid < maxvid) /* lower numbers are higher voltages */
342 reqvid = maxvid;
343
344 while (data->currvid > reqvid) {
345 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
346 data->currvid, reqvid);
347 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
348 return 1;
349 }
350
351 while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) {
352 if (data->currvid == maxvid) {
353 rvosteps = 0;
354 } else {
355 dprintk("ph1: changing vid for rvo, req 0x%x\n",
356 data->currvid - 1);
357 if (decrease_vid_code_by_step(data, data->currvid - 1, 1))
358 return 1;
359 rvosteps--;
360 }
361 }
362
363 if (query_current_values_with_pending_wait(data))
364 return 1;
365
366 if (savefid != data->currfid) {
367 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid);
368 return 1;
369 }
370
371 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
372 data->currfid, data->currvid);
373
374 return 0;
375}
376
377/* Phase 2 - core frequency transition */
378static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
379{
380 u32 vcoreqfid, vcocurrfid, vcofiddiff, fid_interval, savevid = data->currvid;
381
382 if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
383 printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n",
384 reqfid, data->currfid);
385 return 1;
386 }
387
388 if (data->currfid == reqfid) {
389 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid);
390 return 0;
391 }
392
393 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n",
394 smp_processor_id(),
395 data->currfid, data->currvid, reqfid);
396
397 vcoreqfid = convert_fid_to_vco_fid(reqfid);
398 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
399 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
400 : vcoreqfid - vcocurrfid;
401
402 while (vcofiddiff > 2) {
403 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
404
405 if (reqfid > data->currfid) {
406 if (data->currfid > LO_FID_TABLE_TOP) {
407 if (write_new_fid(data, data->currfid + fid_interval)) {
408 return 1;
409 }
410 } else {
411 if (write_new_fid
412 (data, 2 + convert_fid_to_vco_fid(data->currfid))) {
413 return 1;
414 }
415 }
416 } else {
417 if (write_new_fid(data, data->currfid - fid_interval))
418 return 1;
419 }
420
421 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
422 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
423 : vcoreqfid - vcocurrfid;
424 }
425
426 if (write_new_fid(data, reqfid))
427 return 1;
428
429 if (query_current_values_with_pending_wait(data))
430 return 1;
431
432 if (data->currfid != reqfid) {
433 printk(KERN_ERR PFX
434 "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n",
435 data->currfid, reqfid);
436 return 1;
437 }
438
439 if (savevid != data->currvid) {
440 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
441 savevid, data->currvid);
442 return 1;
443 }
444
445 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
446 data->currfid, data->currvid);
447
448 return 0;
449}
450
451/* Phase 3 - core voltage transition flow ... jump to the final vid. */
452static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid)
453{
454 u32 savefid = data->currfid;
455 u32 savereqvid = reqvid;
456
457 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
458 smp_processor_id(),
459 data->currfid, data->currvid);
460
461 if (reqvid != data->currvid) {
462 if (write_new_vid(data, reqvid))
463 return 1;
464
465 if (savefid != data->currfid) {
466 printk(KERN_ERR PFX
467 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
468 savefid, data->currfid);
469 return 1;
470 }
471
472 if (data->currvid != reqvid) {
473 printk(KERN_ERR PFX
474 "ph3: failed vid transition\n, req 0x%x, curr 0x%x",
475 reqvid, data->currvid);
476 return 1;
477 }
478 }
479
480 if (query_current_values_with_pending_wait(data))
481 return 1;
482
483 if (savereqvid != data->currvid) {
484 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
485 return 1;
486 }
487
488 if (savefid != data->currfid) {
489 dprintk("ph3 failed, currfid changed 0x%x\n",
490 data->currfid);
491 return 1;
492 }
493
494 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
495 data->currfid, data->currvid);
496
497 return 0;
498}
499
500static int check_supported_cpu(unsigned int cpu)
501{
502 cpumask_t oldmask = CPU_MASK_ALL;
503 u32 eax, ebx, ecx, edx;
504 unsigned int rc = 0;
505
506 oldmask = current->cpus_allowed;
507 set_cpus_allowed(current, cpumask_of_cpu(cpu));
508
509 if (smp_processor_id() != cpu) {
510 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
511 goto out;
512 }
513
514 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
515 goto out;
516
517 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
518 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
519 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
520 goto out;
521
522 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
523 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
524 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
525 printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
526 goto out;
527 }
528
529 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
530 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
531 printk(KERN_INFO PFX
532 "No frequency change capabilities detected\n");
533 goto out;
534 }
535
536 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
537 if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
538 printk(KERN_INFO PFX "Power state transitions not supported\n");
539 goto out;
540 }
541 } else { /* must be a HW Pstate capable processor */
542 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
543 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
544 cpu_family = CPU_HW_PSTATE;
545 else
546 goto out;
547 }
548
549 rc = 1;
550
551out:
552 set_cpus_allowed(current, oldmask);
553 return rc;
554}
555
556static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
557{
558 unsigned int j;
559 u8 lastfid = 0xff;
560
561 for (j = 0; j < data->numps; j++) {
562 if (pst[j].vid > LEAST_VID) {
563 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
564 return -EINVAL;
565 }
566 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
567 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
568 return -ENODEV;
569 }
570 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
571 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
572 return -ENODEV;
573 }
574 if (pst[j].fid > MAX_FID) {
575 printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j);
576 return -ENODEV;
577 }
578 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
579 /* Only first fid is allowed to be in "low" range */
580 printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
581 return -EINVAL;
582 }
583 if (pst[j].fid < lastfid)
584 lastfid = pst[j].fid;
585 }
586 if (lastfid & 1) {
587 printk(KERN_ERR BFX "lastfid invalid\n");
588 return -EINVAL;
589 }
590 if (lastfid > LO_FID_TABLE_TOP)
591 printk(KERN_INFO BFX "first fid not from lo freq table\n");
592
593 return 0;
594}
595
596static void print_basics(struct powernow_k8_data *data)
597{
598 int j;
599 for (j = 0; j < data->numps; j++) {
600 if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) {
601 if (cpu_family == CPU_HW_PSTATE) {
602 printk(KERN_INFO PFX " %d : fid 0x%x did 0x%x (%d MHz)\n",
603 j,
604 (data->powernow_table[j].index & 0xff00) >> 8,
605 (data->powernow_table[j].index & 0xff0000) >> 16,
606 data->powernow_table[j].frequency/1000);
607 } else {
608 printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n",
609 j,
610 data->powernow_table[j].index & 0xff,
611 data->powernow_table[j].frequency/1000,
612 data->powernow_table[j].index >> 8);
613 }
614 }
615 }
616 if (data->batps)
617 printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps);
618}
619
620static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid)
621{
622 struct cpufreq_frequency_table *powernow_table;
623 unsigned int j;
624
625 if (data->batps) { /* use ACPI support to get full speed on mains power */
626 printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range)\n", data->batps);
627 data->numps = data->batps;
628 }
629
630 for ( j=1; j<data->numps; j++ ) {
631 if (pst[j-1].fid >= pst[j].fid) {
632 printk(KERN_ERR PFX "PST out of sequence\n");
633 return -EINVAL;
634 }
635 }
636
637 if (data->numps < 2) {
638 printk(KERN_ERR PFX "no p states to transition\n");
639 return -ENODEV;
640 }
641
642 if (check_pst_table(data, pst, maxvid))
643 return -EINVAL;
644
645 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
646 * (data->numps + 1)), GFP_KERNEL);
647 if (!powernow_table) {
648 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
649 return -ENOMEM;
650 }
651
652 for (j = 0; j < data->numps; j++) {
653 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
654 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
655 powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid);
656 }
657 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
658 powernow_table[data->numps].index = 0;
659
660 if (query_current_values_with_pending_wait(data)) {
661 kfree(powernow_table);
662 return -EIO;
663 }
664
665 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
666 data->powernow_table = powernow_table;
667 if (first_cpu(cpu_core_map[data->cpu]) == data->cpu)
668 print_basics(data);
669
670 for (j = 0; j < data->numps; j++)
671 if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid))
672 return 0;
673
674 dprintk("currfid/vid do not match PST, ignoring\n");
675 return 0;
676}
677
678/* Find and validate the PSB/PST table in BIOS. */
679static int find_psb_table(struct powernow_k8_data *data)
680{
681 struct psb_s *psb;
682 unsigned int i;
683 u32 mvs;
684 u8 maxvid;
685 u32 cpst = 0;
686 u32 thiscpuid;
687
688 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
689 /* Scan BIOS looking for the signature. */
690 /* It cannot be at 0xffff0 - it is too big. */
691
692 psb = phys_to_virt(i);
693 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
694 continue;
695
696 dprintk("found PSB header at 0x%p\n", psb);
697
698 dprintk("table vers: 0x%x\n", psb->tableversion);
699 if (psb->tableversion != PSB_VERSION_1_4) {
700 printk(KERN_ERR BFX "PSB table is not v1.4\n");
701 return -ENODEV;
702 }
703
704 dprintk("flags: 0x%x\n", psb->flags1);
705 if (psb->flags1) {
706 printk(KERN_ERR BFX "unknown flags\n");
707 return -ENODEV;
708 }
709
710 data->vstable = psb->vstable;
711 dprintk("voltage stabilization time: %d(*20us)\n", data->vstable);
712
713 dprintk("flags2: 0x%x\n", psb->flags2);
714 data->rvo = psb->flags2 & 3;
715 data->irt = ((psb->flags2) >> 2) & 3;
716 mvs = ((psb->flags2) >> 4) & 3;
717 data->vidmvs = 1 << mvs;
718 data->batps = ((psb->flags2) >> 6) & 3;
719
720 dprintk("ramp voltage offset: %d\n", data->rvo);
721 dprintk("isochronous relief time: %d\n", data->irt);
722 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
723
724 dprintk("numpst: 0x%x\n", psb->num_tables);
725 cpst = psb->num_tables;
726 if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){
727 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
728 if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) {
729 cpst = 1;
730 }
731 }
732 if (cpst != 1) {
733 printk(KERN_ERR BFX "numpst must be 1\n");
734 return -ENODEV;
735 }
736
737 data->plllock = psb->plllocktime;
738 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
739 dprintk("maxfid: 0x%x\n", psb->maxfid);
740 dprintk("maxvid: 0x%x\n", psb->maxvid);
741 maxvid = psb->maxvid;
742
743 data->numps = psb->numps;
744 dprintk("numpstates: 0x%x\n", data->numps);
745 return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid);
746 }
747 /*
748 * If you see this message, complain to BIOS manufacturer. If
749 * he tells you "we do not support Linux" or some similar
750 * nonsense, remember that Windows 2000 uses the same legacy
751 * mechanism that the old Linux PSB driver uses. Tell them it
752 * is broken with Windows 2000.
753 *
754 * The reference to the AMD documentation is chapter 9 in the
755 * BIOS and Kernel Developer's Guide, which is available on
756 * www.amd.com
757 */
758 printk(KERN_ERR PFX "BIOS error - no PSB or ACPI _PSS objects\n");
759 return -ENODEV;
760}
761
762#ifdef CONFIG_X86_POWERNOW_K8_ACPI
763static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
764{
765 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
766 return;
767
768 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
769 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
770 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
771 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
772 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
773 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
774}
775
776static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
777{
778 struct cpufreq_frequency_table *powernow_table;
779 int ret_val;
780
781 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
782 dprintk("register performance failed: bad ACPI data\n");
783 return -EIO;
784 }
785
786 /* verify the data contained in the ACPI structures */
787 if (data->acpi_data.state_count <= 1) {
788 dprintk("No ACPI P-States\n");
789 goto err_out;
790 }
791
792 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
793 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
794 dprintk("Invalid control/status registers (%x - %x)\n",
795 data->acpi_data.control_register.space_id,
796 data->acpi_data.status_register.space_id);
797 goto err_out;
798 }
799
800 /* fill in data->powernow_table */
801 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
802 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
803 if (!powernow_table) {
804 dprintk("powernow_table memory alloc failure\n");
805 goto err_out;
806 }
807
808 if (cpu_family == CPU_HW_PSTATE)
809 ret_val = fill_powernow_table_pstate(data, powernow_table);
810 else
811 ret_val = fill_powernow_table_fidvid(data, powernow_table);
812 if (ret_val)
813 goto err_out_mem;
814
815 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
816 powernow_table[data->acpi_data.state_count].index = 0;
817 data->powernow_table = powernow_table;
818
819 /* fill in data */
820 data->numps = data->acpi_data.state_count;
821 if (first_cpu(cpu_core_map[data->cpu]) == data->cpu)
822 print_basics(data);
823 powernow_k8_acpi_pst_values(data, 0);
824
825 /* notify BIOS that we exist */
826 acpi_processor_notify_smm(THIS_MODULE);
827
828 return 0;
829
830err_out_mem:
831 kfree(powernow_table);
832
833err_out:
834 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
835
836 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
837 data->acpi_data.state_count = 0;
838
839 return -ENODEV;
840}
841
842static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
843{
844 int i;
845
846 for (i = 0; i < data->acpi_data.state_count; i++) {
847 u32 index;
848 u32 hi = 0, lo = 0;
849 u32 fid;
850 u32 did;
851
852 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
853 if (index > MAX_HW_PSTATE) {
854 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
855 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
856 }
857 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
858 if (!(hi & HW_PSTATE_VALID_MASK)) {
859 dprintk("invalid pstate %d, ignoring\n", index);
860 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
861 continue;
862 }
863
864 fid = lo & HW_PSTATE_FID_MASK;
865 did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
866
867 dprintk(" %d : fid 0x%x, did 0x%x\n", index, fid, did);
868
869 powernow_table[i].index = index | (fid << HW_FID_INDEX_SHIFT) | (did << HW_DID_INDEX_SHIFT);
870
871 powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, did);
872
873 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
874 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
875 powernow_table[i].frequency,
876 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
877 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
878 continue;
879 }
880 }
881 return 0;
882}
883
884static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
885{
886 int i;
887 int cntlofreq = 0;
888 for (i = 0; i < data->acpi_data.state_count; i++) {
889 u32 fid;
890 u32 vid;
891
892 if (data->exttype) {
893 fid = data->acpi_data.states[i].status & EXT_FID_MASK;
894 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK;
895 } else {
896 fid = data->acpi_data.states[i].control & FID_MASK;
897 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
898 }
899
900 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
901
902 powernow_table[i].index = fid; /* lower 8 bits */
903 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
904 powernow_table[i].frequency = find_khz_freq_from_fid(fid);
905
906 /* verify frequency is OK */
907 if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) ||
908 (powernow_table[i].frequency < (MIN_FREQ * 1000))) {
909 dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency);
910 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
911 continue;
912 }
913
914 /* verify voltage is OK - BIOSs are using "off" to indicate invalid */
915 if (vid == VID_OFF) {
916 dprintk("invalid vid %u, ignoring\n", vid);
917 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
918 continue;
919 }
920
921 /* verify only 1 entry from the lo frequency table */
922 if (fid < HI_FID_TABLE_BOTTOM) {
923 if (cntlofreq) {
924 /* if both entries are the same, ignore this one ... */
925 if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) ||
926 (powernow_table[i].index != powernow_table[cntlofreq].index)) {
927 printk(KERN_ERR PFX "Too many lo freq table entries\n");
928 return 1;
929 }
930
931 dprintk("double low frequency table entry, ignoring it.\n");
932 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
933 continue;
934 } else
935 cntlofreq = i;
936 }
937
938 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
939 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
940 powernow_table[i].frequency,
941 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
942 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
943 continue;
944 }
945 }
946 return 0;
947}
948
949static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
950{
951 if (data->acpi_data.state_count)
952 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
953}
954
955#else
956static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
957static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
958static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
959#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
960
961/* Take a frequency, and issue the fid/vid transition command */
962static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index)
963{
964 u32 fid = 0;
965 u32 vid = 0;
966 int res, i;
967 struct cpufreq_freqs freqs;
968
969 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
970
971 /* fid/vid correctness check for k8 */
972 /* fid are the lower 8 bits of the index we stored into
973 * the cpufreq frequency table in find_psb_table, vid
974 * are the upper 8 bits.
975 */
976 fid = data->powernow_table[index].index & 0xFF;
977 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
978
979 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
980
981 if (query_current_values_with_pending_wait(data))
982 return 1;
983
984 if ((data->currvid == vid) && (data->currfid == fid)) {
985 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
986 fid, vid);
987 return 0;
988 }
989
990 if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) {
991 printk(KERN_ERR PFX
992 "ignoring illegal change in lo freq table-%x to 0x%x\n",
993 data->currfid, fid);
994 return 1;
995 }
996
997 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
998 smp_processor_id(), fid, vid);
999 freqs.old = find_khz_freq_from_fid(data->currfid);
1000 freqs.new = find_khz_freq_from_fid(fid);
1001
1002 for_each_cpu_mask(i, *(data->available_cores)) {
1003 freqs.cpu = i;
1004 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1005 }
1006
1007 res = transition_fid_vid(data, fid, vid);
1008 freqs.new = find_khz_freq_from_fid(data->currfid);
1009
1010 for_each_cpu_mask(i, *(data->available_cores)) {
1011 freqs.cpu = i;
1012 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1013 }
1014 return res;
1015}
1016
1017/* Take a frequency, and issue the hardware pstate transition command */
1018static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index)
1019{
1020 u32 fid = 0;
1021 u32 did = 0;
1022 u32 pstate = 0;
1023 int res, i;
1024 struct cpufreq_freqs freqs;
1025
1026 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1027
1028 /* get fid did for hardware pstate transition */
1029 pstate = index & HW_PSTATE_MASK;
1030 if (pstate > MAX_HW_PSTATE)
1031 return 0;
1032 fid = (index & HW_FID_INDEX_MASK) >> HW_FID_INDEX_SHIFT;
1033 did = (index & HW_DID_INDEX_MASK) >> HW_DID_INDEX_SHIFT;
1034 freqs.old = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1035 freqs.new = find_khz_freq_from_fiddid(fid, did);
1036
1037 for_each_cpu_mask(i, *(data->available_cores)) {
1038 freqs.cpu = i;
1039 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1040 }
1041
1042 res = transition_pstate(data, pstate);
1043 data->currfid = find_fid_from_pstate(pstate);
1044 data->currdid = find_did_from_pstate(pstate);
1045 freqs.new = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1046
1047 for_each_cpu_mask(i, *(data->available_cores)) {
1048 freqs.cpu = i;
1049 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1050 }
1051 return res;
1052}
1053
1054/* Driver entry point to switch to the target frequency */
1055static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1056{
1057 cpumask_t oldmask = CPU_MASK_ALL;
1058 struct powernow_k8_data *data = powernow_data[pol->cpu];
1059 u32 checkfid;
1060 u32 checkvid;
1061 unsigned int newstate;
1062 int ret = -EIO;
1063
1064 if (!data)
1065 return -EINVAL;
1066
1067 checkfid = data->currfid;
1068 checkvid = data->currvid;
1069
1070 /* only run on specific CPU from here on */
1071 oldmask = current->cpus_allowed;
1072 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
1073
1074 if (smp_processor_id() != pol->cpu) {
1075 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1076 goto err_out;
1077 }
1078
1079 if (pending_bit_stuck()) {
1080 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1081 goto err_out;
1082 }
1083
1084 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1085 pol->cpu, targfreq, pol->min, pol->max, relation);
1086
1087 if (query_current_values_with_pending_wait(data))
1088 goto err_out;
1089
1090 if (cpu_family == CPU_HW_PSTATE)
1091 dprintk("targ: curr fid 0x%x, did 0x%x\n",
1092 data->currfid, data->currdid);
1093 else {
1094 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1095 data->currfid, data->currvid);
1096
1097 if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
1098 printk(KERN_INFO PFX
1099 "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n",
1100 checkfid, data->currfid, checkvid, data->currvid);
1101 }
1102 }
1103
1104 if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate))
1105 goto err_out;
1106
1107 mutex_lock(&fidvid_mutex);
1108
1109 powernow_k8_acpi_pst_values(data, newstate);
1110
1111 if (cpu_family == CPU_HW_PSTATE)
1112 ret = transition_frequency_pstate(data, newstate);
1113 else
1114 ret = transition_frequency_fidvid(data, newstate);
1115 if (ret) {
1116 printk(KERN_ERR PFX "transition frequency failed\n");
1117 ret = 1;
1118 mutex_unlock(&fidvid_mutex);
1119 goto err_out;
1120 }
1121 mutex_unlock(&fidvid_mutex);
1122
1123 if (cpu_family == CPU_HW_PSTATE)
1124 pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1125 else
1126 pol->cur = find_khz_freq_from_fid(data->currfid);
1127 ret = 0;
1128
1129err_out:
1130 set_cpus_allowed(current, oldmask);
1131 return ret;
1132}
1133
1134/* Driver entry point to verify the policy and range of frequencies */
1135static int powernowk8_verify(struct cpufreq_policy *pol)
1136{
1137 struct powernow_k8_data *data = powernow_data[pol->cpu];
1138
1139 if (!data)
1140 return -EINVAL;
1141
1142 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1143}
1144
1145/* per CPU init entry point to the driver */
1146static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1147{
1148 struct powernow_k8_data *data;
1149 cpumask_t oldmask = CPU_MASK_ALL;
1150 int rc;
1151
1152 if (!cpu_online(pol->cpu))
1153 return -ENODEV;
1154
1155 if (!check_supported_cpu(pol->cpu))
1156 return -ENODEV;
1157
1158 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1159 if (!data) {
1160 printk(KERN_ERR PFX "unable to alloc powernow_k8_data\n");
1161 return -ENOMEM;
1162 }
1163
1164 data->cpu = pol->cpu;
1165
1166 if (powernow_k8_cpu_init_acpi(data)) {
1167 /*
1168 * Use the PSB BIOS structure. This is only available on
1169 * a UP version, and is deprecated by AMD.
1170 */
1171 if (num_online_cpus() != 1) {
1172 printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n");
1173 kfree(data);
1174 return -ENODEV;
1175 }
1176 if (pol->cpu != 0) {
1177 printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n");
1178 kfree(data);
1179 return -ENODEV;
1180 }
1181 rc = find_psb_table(data);
1182 if (rc) {
1183 kfree(data);
1184 return -ENODEV;
1185 }
1186 }
1187
1188 /* only run on specific CPU from here on */
1189 oldmask = current->cpus_allowed;
1190 set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
1191
1192 if (smp_processor_id() != pol->cpu) {
1193 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1194 goto err_out;
1195 }
1196
1197 if (pending_bit_stuck()) {
1198 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1199 goto err_out;
1200 }
1201
1202 if (query_current_values_with_pending_wait(data))
1203 goto err_out;
1204
1205 if (cpu_family == CPU_OPTERON)
1206 fidvid_msr_init();
1207
1208 /* run on any CPU again */
1209 set_cpus_allowed(current, oldmask);
1210
1211 pol->governor = CPUFREQ_DEFAULT_GOVERNOR;
1212 if (cpu_family == CPU_HW_PSTATE)
1213 pol->cpus = cpumask_of_cpu(pol->cpu);
1214 else
1215 pol->cpus = cpu_core_map[pol->cpu];
1216 data->available_cores = &(pol->cpus);
1217
1218 /* Take a crude guess here.
1219	 * That guess was in microseconds, so multiply by 1000 */
1220 pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
1221 + (3 * (1 << data->irt) * 10)) * 1000;
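	/*
	 * Worked example (hypothetical values, not from this patch): with
	 * rvo = 0, vstable = 5 and irt = 2 the estimate above evaluates to
	 * ((0 + 8) * 5 * 20 + 3 * (1 << 2) * 10) * 1000 = 920000 ns,
	 * i.e. roughly 0.9 ms of transition latency.
	 */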
1222
1223 if (cpu_family == CPU_HW_PSTATE)
1224 pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1225 else
1226 pol->cur = find_khz_freq_from_fid(data->currfid);
1227 dprintk("policy current frequency %d kHz\n", pol->cur);
1228
1229 /* min/max the cpu is capable of */
1230 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1231 printk(KERN_ERR PFX "invalid powernow_table\n");
1232 powernow_k8_cpu_exit_acpi(data);
1233 kfree(data->powernow_table);
1234 kfree(data);
1235 return -EINVAL;
1236 }
1237
1238 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1239
1240 if (cpu_family == CPU_HW_PSTATE)
1241 dprintk("cpu_init done, current fid 0x%x, did 0x%x\n",
1242 data->currfid, data->currdid);
1243 else
1244 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1245 data->currfid, data->currvid);
1246
1247 powernow_data[pol->cpu] = data;
1248
1249 return 0;
1250
1251err_out:
1252 set_cpus_allowed(current, oldmask);
1253 powernow_k8_cpu_exit_acpi(data);
1254
1255 kfree(data);
1256 return -ENODEV;
1257}
1258
1259static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1260{
1261 struct powernow_k8_data *data = powernow_data[pol->cpu];
1262
1263 if (!data)
1264 return -EINVAL;
1265
1266 powernow_k8_cpu_exit_acpi(data);
1267
1268 cpufreq_frequency_table_put_attr(pol->cpu);
1269
1270 kfree(data->powernow_table);
1271 kfree(data);
1272
1273 return 0;
1274}
1275
1276static unsigned int powernowk8_get (unsigned int cpu)
1277{
1278 struct powernow_k8_data *data;
1279 cpumask_t oldmask = current->cpus_allowed;
1280 unsigned int khz = 0;
1281
1282 data = powernow_data[first_cpu(cpu_core_map[cpu])];
1283
1284 if (!data)
1285 return -EINVAL;
1286
1287 set_cpus_allowed(current, cpumask_of_cpu(cpu));
1288 if (smp_processor_id() != cpu) {
1289 printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
1290 set_cpus_allowed(current, oldmask);
1291 return 0;
1292 }
1293
1294 if (query_current_values_with_pending_wait(data))
1295 goto out;
1296
1297 if (cpu_family == CPU_HW_PSTATE)
1298 khz = find_khz_freq_from_fiddid(data->currfid, data->currdid);
1299 else
1300 khz = find_khz_freq_from_fid(data->currfid);
1301
1302
1303out:
1304 set_cpus_allowed(current, oldmask);
1305 return khz;
1306}
1307
1308static struct freq_attr* powernow_k8_attr[] = {
1309 &cpufreq_freq_attr_scaling_available_freqs,
1310 NULL,
1311};
1312
1313static struct cpufreq_driver cpufreq_amd64_driver = {
1314 .verify = powernowk8_verify,
1315 .target = powernowk8_target,
1316 .init = powernowk8_cpu_init,
1317 .exit = __devexit_p(powernowk8_cpu_exit),
1318 .get = powernowk8_get,
1319 .name = "powernow-k8",
1320 .owner = THIS_MODULE,
1321 .attr = powernow_k8_attr,
1322};
1323
1324/* driver entry point for init */
1325static int __cpuinit powernowk8_init(void)
1326{
1327 unsigned int i, supported_cpus = 0;
1328 unsigned int booted_cores = 1;
1329
1330 for_each_online_cpu(i) {
1331 if (check_supported_cpu(i))
1332 supported_cpus++;
1333 }
1334
1335#ifdef CONFIG_SMP
1336 booted_cores = cpu_data[0].booted_cores;
1337#endif
1338
1339 if (supported_cpus == num_online_cpus()) {
1340 printk(KERN_INFO PFX "Found %d %s "
1341 "processors (%d cpu cores) (" VERSION ")\n",
1342 supported_cpus/booted_cores,
1343 boot_cpu_data.x86_model_id, supported_cpus);
1344 return cpufreq_register_driver(&cpufreq_amd64_driver);
1345 }
1346
1347 return -ENODEV;
1348}
1349
1350/* driver entry point for term */
1351static void __exit powernowk8_exit(void)
1352{
1353 dprintk("exit\n");
1354
1355 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1356}
1357
1358MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
1359MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1360MODULE_LICENSE("GPL");
1361
1362late_initcall(powernowk8_init);
1363module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
new file mode 100644
index 000000000000..b06c812208ca
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -0,0 +1,232 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8struct powernow_k8_data {
9 unsigned int cpu;
10
11 u32 numps; /* number of p-states */
12 u32 batps; /* number of p-states supported on battery */
13
14 /* these values are constant when the PSB is used to determine
15 * vid/fid pairings, but are modified during the ->target() call
16 * when ACPI is used */
17 u32 rvo; /* ramp voltage offset */
18 u32 irt; /* isochronous relief time */
19 u32 vidmvs; /* usable value calculated from mvs */
20 u32 vstable; /* voltage stabilization time, units 20 us */
21 u32 plllock; /* pll lock time, units 1 us */
22 u32 exttype; /* extended interface = 1 */
23
24 /* keep track of the current fid / vid or did */
25 u32 currvid, currfid, currdid;
26
27 /* the powernow_table includes all frequency and vid/fid pairings:
28 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
29 * frequency is in kHz */
30 struct cpufreq_frequency_table *powernow_table;
31
32#ifdef CONFIG_X86_POWERNOW_K8_ACPI
33 /* the acpi table needs to be kept. it's only available if ACPI was
34 * used to determine valid frequency/vid/fid states */
35 struct acpi_processor_performance acpi_data;
36#endif
37 /* we need to keep track of associated cores, but let cpufreq
38 * handle hotplug events - so just point at cpufreq pol->cpus
39 * structure */
40 cpumask_t *available_cores;
41};
42
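/*
 * Illustrative sketch, not part of this patch: splitting a powernow_table
 * index into its fid/vid halves, per the layout described in the comment
 * above (fid in bits 7:0, vid in bits 15:8 of the index).
 */
static inline void example_split_index(u32 index, u32 *fid, u32 *vid)
{
	*fid = index & 0xff;		/* lower 8 bits */
	*vid = (index >> 8) & 0xff;	/* upper 8 bits */
}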
43
44/* processor's cpuid instruction support */
45#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
46#define CPUID_XFAM 0x0ff00000 /* extended family */
47#define CPUID_XFAM_K8 0
48#define CPUID_XMOD 0x000f0000 /* extended model */
49#define CPUID_XMOD_REV_MASK 0x00080000
50#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
51#define CPUID_USE_XFAM_XMOD 0x00000f00
52#define CPUID_GET_MAX_CAPABILITIES 0x80000000
53#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
54#define P_STATE_TRANSITION_CAPABLE 6
55
56/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
57/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
58/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
59/* the register number is placed in ecx, and the data is returned in edx:eax. */
60
61#define MSR_FIDVID_CTL 0xc0010041
62#define MSR_FIDVID_STATUS 0xc0010042
63
64/* Field definitions within the FID VID Low Control MSR : */
65#define MSR_C_LO_INIT_FID_VID 0x00010000
66#define MSR_C_LO_NEW_VID 0x00003f00
67#define MSR_C_LO_NEW_FID 0x0000003f
68#define MSR_C_LO_VID_SHIFT 8
69
70/* Field definitions within the FID VID High Control MSR : */
71#define MSR_C_HI_STP_GNT_TO 0x000fffff
72
73/* Field definitions within the FID VID Low Status MSR : */
74#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
75#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
76#define MSR_S_LO_MAX_FID 0x003f0000
77#define MSR_S_LO_START_FID 0x00003f00
78#define MSR_S_LO_CURRENT_FID 0x0000003f
79
80/* Field definitions within the FID VID High Status MSR : */
81#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
82#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
83#define MSR_S_HI_START_VID 0x00003f00
84#define MSR_S_HI_CURRENT_VID 0x0000003f
85#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
86
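/*
 * Illustrative sketch, not part of this patch: reading the FID/VID status
 * MSR and extracting the current fid/vid with the masks above.  Assumes the
 * rdmsr() helper from <asm/msr.h>; the query_current_values_* helpers in
 * powernow-k8.c do the real work, including the change-pending wait.
 */
static inline void example_read_fidvid_status(u32 *fid, u32 *vid)
{
	u32 lo, hi;

	rdmsr(MSR_FIDVID_STATUS, lo, hi);	/* register number goes in ecx */
	*fid = lo & MSR_S_LO_CURRENT_FID;	/* low word, bits 5:0 */
	*vid = hi & MSR_S_HI_CURRENT_VID;	/* high word, bits 5:0 */
}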
87
88/* Hardware Pstate _PSS and MSR definitions */
89#define USE_HW_PSTATE 0x00000080
90#define HW_PSTATE_FID_MASK 0x0000003f
91#define HW_PSTATE_DID_MASK 0x000001c0
92#define HW_PSTATE_DID_SHIFT 6
93#define HW_PSTATE_MASK 0x00000007
94#define HW_PSTATE_VALID_MASK 0x80000000
95#define HW_FID_INDEX_SHIFT 8
96#define HW_FID_INDEX_MASK 0x0000ff00
97#define HW_DID_INDEX_SHIFT 16
98#define HW_DID_INDEX_MASK 0x00ff0000
99#define HW_WATTS_MASK 0xff
100#define HW_PWR_DVR_MASK 0x300
101#define HW_PWR_DVR_SHIFT 8
102#define HW_PWR_MAX_MULT 3
103#define MAX_HW_PSTATE 8 /* hw pstate supports up to 8 */
104#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
105#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
106#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
107
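/*
 * Illustrative sketch, not part of this patch: decoding one family 0x10
 * P-state definition MSR (low word) with the masks above.  The core
 * frequency formula 100 MHz * (fid + 0x10) >> did is an assumption of this
 * sketch; it mirrors the fid/did handling in powernow-k8.c.
 */
static inline unsigned int example_pstate_def_to_khz(u32 lo)
{
	u32 fid = lo & HW_PSTATE_FID_MASK;
	u32 did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;

	return (100000 * (fid + 0x10)) >> did;
}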
108/* define the two driver architectures */
109#define CPU_OPTERON 0
110#define CPU_HW_PSTATE 1
111
112
113/*
114 * There are restrictions that frequencies have to follow:
115 * - only 1 entry in the low fid table ( <=1.4GHz )
116 * - lowest entry in the high fid table must be >= 2 * the entry in the
117 * low fid table
118 * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry
119 * in the low fid table
120 * - the parts can only step at <= 200 MHz intervals, odd fid values are
121 * supported in revision G and later revisions.
122 * - lowest frequency must be >= interprocessor hypertransport link speed
123 * (only applies to MP systems obviously)
124 */
125
126/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
127#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
128#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
129
130#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
131#define HI_VCOFREQ_TABLE_BOTTOM 1600
132
133#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
134
135#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
136#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
137
138#define MIN_FREQ 800 /* Min and max freqs, per spec */
139#define MAX_FREQ 5000
140
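/*
 * Illustrative sketch, not part of this patch: the fid -> frequency mapping
 * used by the fid/vid (K8) code path.  Each fid step of 1 corresponds to
 * 100 MHz on top of the 800 MHz MIN_FREQ, which is what
 * find_khz_freq_from_fid() in powernow-k8.c returns in kHz.
 */
static inline unsigned int example_fid_to_khz(u32 fid)
{
	return 1000 * (MIN_FREQ + 100 * fid);
}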
141#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
142#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
143
144#define VID_OFF 0x3f
145
146#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
147
148#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
149
150#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
151#define VST_UNITS_20US   20	/* Voltage Stabilization Time is in units of 20us */
152
153/*
154 * Most values of interest are encoded in a single field of the _PSS
155 * entries: the "control" value; a decoding sketch follows the masks below.
156 */
157
158#define IRT_SHIFT 30
159#define RVO_SHIFT 28
160#define EXT_TYPE_SHIFT 27
161#define PLL_L_SHIFT 20
162#define MVS_SHIFT 18
163#define VST_SHIFT 11
164#define VID_SHIFT 6
165#define IRT_MASK 3
166#define RVO_MASK 3
167#define EXT_TYPE_MASK 1
168#define PLL_L_MASK 0x7f
169#define MVS_MASK 3
170#define VST_MASK 0x7f
171#define VID_MASK 0x1f
172#define FID_MASK 0x1f
173#define EXT_VID_MASK 0x3f
174#define EXT_FID_MASK 0x3f
175
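/*
 * Illustrative sketch, not part of this patch: unpacking an ACPI _PSS
 * "control" value with the shift/mask pairs above.  The assignments mirror
 * what powernow_k8_acpi_pst_values() stores in struct powernow_k8_data.
 */
static inline void example_decode_pss_control(u32 control,
					      struct powernow_k8_data *data)
{
	data->irt     = (control >> IRT_SHIFT) & IRT_MASK;
	data->rvo     = (control >> RVO_SHIFT) & RVO_MASK;
	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
	data->vidmvs  = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
	data->vstable = (control >> VST_SHIFT) & VST_MASK;
}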
176
177/*
178 * Version 1.4 of the PSB table. This table is constructed by the BIOS
179 * to tell the OS's power management driver which VIDs and FIDs are
180 * supported by this particular processor.
181 * If the data in the PSB / PST is wrong, then this driver will program the
182 * wrong values into hardware, which is very likely to lead to a crash.
183 */
184
185#define PSB_ID_STRING "AMDK7PNOW!"
186#define PSB_ID_STRING_LEN 10
187
188#define PSB_VERSION_1_4 0x14
189
190struct psb_s {
191 u8 signature[10];
192 u8 tableversion;
193 u8 flags1;
194 u16 vstable;
195 u8 flags2;
196 u8 num_tables;
197 u32 cpuid;
198 u8 plllocktime;
199 u8 maxfid;
200 u8 maxvid;
201 u8 numps;
202};
203
204/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
205struct pst_s {
206 u8 fid;
207 u8 vid;
208};
209
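/*
 * Illustrative sketch, not part of this patch: how a PSB table is located.
 * find_psb_table() in powernow-k8.c scans the BIOS area for PSB_ID_STRING
 * and reads the struct psb_s header, followed by numps struct pst_s
 * fid/vid pairs.  The address range and the phys_to_virt() mapping are
 * assumptions of this sketch (it needs <asm/io.h> and <linux/string.h>).
 */
static inline struct psb_s *example_find_psb(void)
{
	unsigned int i;

	for (i = 0xc0000; i < 0xffff0; i += 0x10) {
		struct psb_s *psb = phys_to_virt(i);

		if (memcmp(psb->signature, PSB_ID_STRING,
			   PSB_ID_STRING_LEN) == 0 &&
		    psb->tableversion == PSB_VERSION_1_4)
			return psb;	/* pst_s entries follow this header */
	}
	return NULL;
}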
210#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
211
212static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid);
213static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
214static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
215
216static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
217
218#ifdef CONFIG_X86_POWERNOW_K8_ACPI
219static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
220static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
221#endif
222
223#ifdef CONFIG_SMP
224static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
225{
226}
227#else
228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
229{
230 cpu_set(0, cpu_sharedcore_mask[0]);
231}
232#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
new file mode 100644
index 000000000000..b8fb4b521c62
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
@@ -0,0 +1,191 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22
23#include <asm/msr.h>
24#include <asm/timex.h>
25#include <asm/io.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "sc520_freq", msg)
33
34static struct cpufreq_frequency_table sc520_freq_table[] = {
35 {0x01, 100000},
36 {0x02, 133000},
37 {0, CPUFREQ_TABLE_END},
38};
39
40static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
41{
42 u8 clockspeed_reg = *cpuctl;
43
44 switch (clockspeed_reg & 0x03) {
45 default:
46 printk(KERN_ERR "sc520_freq: error: cpuctl register has unexpected value %02x\n", clockspeed_reg);
47 case 0x01:
48 return 100000;
49 case 0x02:
50 return 133000;
51 }
52}
53
54static void sc520_freq_set_cpu_state (unsigned int state)
55{
56
57 struct cpufreq_freqs freqs;
58 u8 clockspeed_reg;
59
60 freqs.old = sc520_freq_get_cpu_frequency(0);
61 freqs.new = sc520_freq_table[state].frequency;
62 freqs.cpu = 0; /* AMD Elan is UP */
63
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 dprintk("attempting to set frequency to %i kHz\n",
67 sc520_freq_table[state].frequency);
68
69 local_irq_disable();
70
71 clockspeed_reg = *cpuctl & ~0x03;
72 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
73
74 local_irq_enable();
75
76 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
77}
78
79static int sc520_freq_verify (struct cpufreq_policy *policy)
80{
81 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
82}
83
84static int sc520_freq_target (struct cpufreq_policy *policy,
85 unsigned int target_freq,
86 unsigned int relation)
87{
88 unsigned int newstate = 0;
89
90 if (cpufreq_frequency_table_target(policy, sc520_freq_table, target_freq, relation, &newstate))
91 return -EINVAL;
92
93 sc520_freq_set_cpu_state(newstate);
94
95 return 0;
96}
97
98
99/*
100 * Module init and exit code
101 */
102
103static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
104{
105 struct cpuinfo_x86 *c = cpu_data;
106 int result;
107
108 /* capability check */
109 if (c->x86_vendor != X86_VENDOR_AMD ||
110 c->x86 != 4 || c->x86_model != 9)
111 return -ENODEV;
112
113 /* cpuinfo and default policy values */
114 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
115 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
116 policy->cur = sc520_freq_get_cpu_frequency(0);
117
118 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
119 if (result)
120 return (result);
121
122 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
123
124 return 0;
125}
126
127
128static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
129{
130 cpufreq_frequency_table_put_attr(policy->cpu);
131 return 0;
132}
133
134
135static struct freq_attr* sc520_freq_attr[] = {
136 &cpufreq_freq_attr_scaling_available_freqs,
137 NULL,
138};
139
140
141static struct cpufreq_driver sc520_freq_driver = {
142 .get = sc520_freq_get_cpu_frequency,
143 .verify = sc520_freq_verify,
144 .target = sc520_freq_target,
145 .init = sc520_freq_cpu_init,
146 .exit = sc520_freq_cpu_exit,
147 .name = "sc520_freq",
148 .owner = THIS_MODULE,
149 .attr = sc520_freq_attr,
150};
151
152
153static int __init sc520_freq_init(void)
154{
155 struct cpuinfo_x86 *c = cpu_data;
156 int err;
157
158 /* Test if we have the right hardware */
159 if(c->x86_vendor != X86_VENDOR_AMD ||
160 c->x86 != 4 || c->x86_model != 9) {
161 dprintk("no Elan SC520 processor found!\n");
162 return -ENODEV;
163 }
164 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
165 if(!cpuctl) {
166 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
167 return -ENOMEM;
168 }
169
170 err = cpufreq_register_driver(&sc520_freq_driver);
171 if (err)
172 iounmap(cpuctl);
173
174 return err;
175}
176
177
178static void __exit sc520_freq_exit(void)
179{
180 cpufreq_unregister_driver(&sc520_freq_driver);
181 iounmap(cpuctl);
182}
183
184
185MODULE_LICENSE("GPL");
186MODULE_AUTHOR("Sean Young <sean@mess.org>");
187MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
188
189module_init(sc520_freq_init);
190module_exit(sc520_freq_exit);
191
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
new file mode 100644
index 000000000000..6c5dc2c85aeb
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -0,0 +1,634 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23
24#include <asm/msr.h>
25#include <asm/processor.h>
26#include <asm/cpufeature.h>
27
28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32
33#define INTEL_MSR_RANGE (0xffff)
34
35struct cpu_id
36{
37 __u8 x86; /* CPU family */
38 __u8 x86_model; /* model */
39 __u8 x86_mask; /* stepping */
40};
41
42enum {
43 CPU_BANIAS,
44 CPU_DOTHAN_A1,
45 CPU_DOTHAN_A2,
46 CPU_DOTHAN_B0,
47 CPU_MP4HT_D0,
48 CPU_MP4HT_E0,
49};
50
51static const struct cpu_id cpu_ids[] = {
52 [CPU_BANIAS] = { 6, 9, 5 },
53 [CPU_DOTHAN_A1] = { 6, 13, 1 },
54 [CPU_DOTHAN_A2] = { 6, 13, 2 },
55 [CPU_DOTHAN_B0] = { 6, 13, 6 },
56 [CPU_MP4HT_D0] = {15, 3, 4 },
57 [CPU_MP4HT_E0] = {15, 4, 1 },
58};
59#define N_IDS ARRAY_SIZE(cpu_ids)
60
61struct cpu_model
62{
63 const struct cpu_id *cpu_id;
64 const char *model_name;
65 unsigned max_freq; /* max clock in kHz */
66
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x);
70
71/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS];
73static const struct cpu_id *centrino_cpu[NR_CPUS];
74
75static struct cpufreq_driver centrino_driver;
76
77#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
78
79/* Computes the correct form for IA32_PERF_CTL MSR for a particular
80 frequency/voltage operating point; frequency in MHz, volts in mV.
81 This is stored as "index" in the structure. */
82#define OP(mhz, mv) \
83 { \
84 .frequency = (mhz) * 1000, \
85 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
86 }
87
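/*
 * Illustrative sketch, not part of this patch: inverting the OP() encoding
 * above, i.e. recovering MHz and mV from a stored "index".  The 700 mV base
 * and 16 mV step mirror the macro; the driver itself goes the other way via
 * extract_clock().
 */
static inline void example_decode_op_index(unsigned int index,
					   unsigned int *mhz, unsigned int *mv)
{
	*mhz = ((index >> 8) & 0xff) * 100;
	*mv  = (index & 0xff) * 16 + 700;
}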
88/*
89 * These voltage tables were derived from the Intel Pentium M
90 * datasheet, document 25261202.pdf, Table 5. I have verified they
91 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
92 * M.
93 */
94
95/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
96static struct cpufreq_frequency_table banias_900[] =
97{
98 OP(600, 844),
99 OP(800, 988),
100 OP(900, 1004),
101 { .frequency = CPUFREQ_TABLE_END }
102};
103
104/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
105static struct cpufreq_frequency_table banias_1000[] =
106{
107 OP(600, 844),
108 OP(800, 972),
109 OP(900, 988),
110 OP(1000, 1004),
111 { .frequency = CPUFREQ_TABLE_END }
112};
113
114/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
115static struct cpufreq_frequency_table banias_1100[] =
116{
117 OP( 600, 956),
118 OP( 800, 1020),
119 OP( 900, 1100),
120 OP(1000, 1164),
121 OP(1100, 1180),
122 { .frequency = CPUFREQ_TABLE_END }
123};
124
125
126/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
127static struct cpufreq_frequency_table banias_1200[] =
128{
129 OP( 600, 956),
130 OP( 800, 1004),
131 OP( 900, 1020),
132 OP(1000, 1100),
133 OP(1100, 1164),
134 OP(1200, 1180),
135 { .frequency = CPUFREQ_TABLE_END }
136};
137
138/* Intel Pentium M processor 1.30GHz (Banias) */
139static struct cpufreq_frequency_table banias_1300[] =
140{
141 OP( 600, 956),
142 OP( 800, 1260),
143 OP(1000, 1292),
144 OP(1200, 1356),
145 OP(1300, 1388),
146 { .frequency = CPUFREQ_TABLE_END }
147};
148
149/* Intel Pentium M processor 1.40GHz (Banias) */
150static struct cpufreq_frequency_table banias_1400[] =
151{
152 OP( 600, 956),
153 OP( 800, 1180),
154 OP(1000, 1308),
155 OP(1200, 1436),
156 OP(1400, 1484),
157 { .frequency = CPUFREQ_TABLE_END }
158};
159
160/* Intel Pentium M processor 1.50GHz (Banias) */
161static struct cpufreq_frequency_table banias_1500[] =
162{
163 OP( 600, 956),
164 OP( 800, 1116),
165 OP(1000, 1228),
166 OP(1200, 1356),
167 OP(1400, 1452),
168 OP(1500, 1484),
169 { .frequency = CPUFREQ_TABLE_END }
170};
171
172/* Intel Pentium M processor 1.60GHz (Banias) */
173static struct cpufreq_frequency_table banias_1600[] =
174{
175 OP( 600, 956),
176 OP( 800, 1036),
177 OP(1000, 1164),
178 OP(1200, 1276),
179 OP(1400, 1420),
180 OP(1600, 1484),
181 { .frequency = CPUFREQ_TABLE_END }
182};
183
184/* Intel Pentium M processor 1.70GHz (Banias) */
185static struct cpufreq_frequency_table banias_1700[] =
186{
187 OP( 600, 956),
188 OP( 800, 1004),
189 OP(1000, 1116),
190 OP(1200, 1228),
191 OP(1400, 1308),
192 OP(1700, 1484),
193 { .frequency = CPUFREQ_TABLE_END }
194};
195#undef OP
196
197#define _BANIAS(cpuid, max, name) \
198{ .cpu_id = cpuid, \
199 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
200 .max_freq = (max)*1000, \
201 .op_points = banias_##max, \
202}
203#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
204
205/* CPU models, their operating frequency range, and freq/voltage
206 operating points */
207static struct cpu_model models[] =
208{
209 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
210 BANIAS(1000),
211 BANIAS(1100),
212 BANIAS(1200),
213 BANIAS(1300),
214 BANIAS(1400),
215 BANIAS(1500),
216 BANIAS(1600),
217 BANIAS(1700),
218
219 /* NULL model_name is a wildcard */
220 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
221 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
222 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
223 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
224 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
225
226 { NULL, }
227};
228#undef _BANIAS
229#undef BANIAS
230
231static int centrino_cpu_init_table(struct cpufreq_policy *policy)
232{
233 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
234 struct cpu_model *model;
235
236 for(model = models; model->cpu_id != NULL; model++)
237 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
238 (model->model_name == NULL ||
239 strcmp(cpu->x86_model_id, model->model_name) == 0))
240 break;
241
242 if (model->cpu_id == NULL) {
243 /* No match at all */
244 dprintk("no support for CPU model \"%s\": "
245 "send /proc/cpuinfo to " MAINTAINER "\n",
246 cpu->x86_model_id);
247 return -ENOENT;
248 }
249
250 if (model->op_points == NULL) {
251 /* Matched a non-match */
252 dprintk("no table support for CPU model \"%s\"\n",
253 cpu->x86_model_id);
254 dprintk("try using the acpi-cpufreq driver\n");
255 return -ENOENT;
256 }
257
258 centrino_model[policy->cpu] = model;
259
260 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq);
262
263 return 0;
264}
265
266#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; }
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x)
271{
272 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) &&
274 (c->x86_mask == x->x86_mask))
275 return 1;
276 return 0;
277}
278
279/* To be called only after centrino_model is initialized */
280static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
281{
282 int i;
283
284 /*
285 * Extract clock in kHz from PERF_CTL value
286 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure.
288 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff;
293 return msr * 100000;
294 }
295
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points))
297 return 0;
298
299 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) {
301 if (msr == centrino_model[cpu]->op_points[i].index)
302 return centrino_model[cpu]->op_points[i].frequency;
303 }
304 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency;
306 else
307 return 0;
308}
309
310/* Return the current CPU frequency in kHz */
311static unsigned int get_cur_freq(unsigned int cpu)
312{
313 unsigned l, h;
314 unsigned clock_freq;
315 cpumask_t saved_mask;
316
317 saved_mask = current->cpus_allowed;
318 set_cpus_allowed(current, cpumask_of_cpu(cpu));
319 if (smp_processor_id() != cpu)
320 return 0;
321
322 rdmsr(MSR_IA32_PERF_STATUS, l, h);
323 clock_freq = extract_clock(l, cpu, 0);
324
325 if (unlikely(clock_freq == 0)) {
326 /*
327 * On some CPUs, we can see transient MSR values (which are
328 * not present in _PSS), while CPU is doing some automatic
329 * P-state transition (like TM2). Get the last freq set
330 * in PERF_CTL.
331 */
332 rdmsr(MSR_IA32_PERF_CTL, l, h);
333 clock_freq = extract_clock(l, cpu, 1);
334 }
335
336 set_cpus_allowed(current, saved_mask);
337 return clock_freq;
338}
339
340
341static int centrino_cpu_init(struct cpufreq_policy *policy)
342{
343 struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu];
344 unsigned freq;
345 unsigned l, h;
346 int ret;
347 int i;
348
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV;
352
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
354 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
355
356 if (policy->cpu != 0)
357 return -ENODEV;
358
359 for (i = 0; i < N_IDS; i++)
360 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
361 break;
362
363 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i];
365
366 if (!centrino_cpu[policy->cpu]) {
367 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n");
370 return -ENODEV;
371 }
372
373 if (centrino_cpu_init_table(policy)) {
374 return -ENODEV;
375 }
376
377 /* Check to see if Enhanced SpeedStep is enabled, and try to
378 enable it if not. */
379 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
380
381 if (!(l & (1<<16))) {
382 l |= (1<<16);
383 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
384 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV;
391 }
392 }
393
394 freq = get_cur_freq(policy->cpu);
395
396 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
397 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */
398 policy->cur = freq;
399
400 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
401
402 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points);
403 if (ret)
404 return (ret);
405
406 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu);
407
408 return 0;
409}
410
411static int centrino_cpu_exit(struct cpufreq_policy *policy)
412{
413 unsigned int cpu = policy->cpu;
414
415 if (!centrino_model[cpu])
416 return -ENODEV;
417
418 cpufreq_frequency_table_put_attr(cpu);
419
420 centrino_model[cpu] = NULL;
421
422 return 0;
423}
424
425/**
426 * centrino_verify - verifies a new CPUFreq policy
427 * @policy: new policy
428 *
429 * Limit must be within this model's frequency range, with at least one
430 * border included.
431 */
432static int centrino_verify (struct cpufreq_policy *policy)
433{
434 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points);
435}
436
437/**
438 * centrino_target - set a new CPUFreq policy
439 * @policy: new policy
440 * @target_freq: the target frequency
441 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
442 *
443 * Sets a new CPUFreq policy.
444 */
445static int centrino_target (struct cpufreq_policy *policy,
446 unsigned int target_freq,
447 unsigned int relation)
448{
449 unsigned int newstate = 0;
450 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
451 struct cpufreq_freqs freqs;
452 cpumask_t online_policy_cpus;
453 cpumask_t saved_mask;
454 cpumask_t set_mask;
455 cpumask_t covered_cpus;
456 int retval = 0;
457 unsigned int j, k, first_cpu, tmp;
458
459 if (unlikely(centrino_model[cpu] == NULL))
460 return -ENODEV;
461
462 if (unlikely(cpufreq_frequency_table_target(policy,
463 centrino_model[cpu]->op_points,
464 target_freq,
465 relation,
466 &newstate))) {
467 return -EINVAL;
468 }
469
470#ifdef CONFIG_HOTPLUG_CPU
471 /* cpufreq holds the hotplug lock, so we are safe from here on */
472 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
473#else
474 online_policy_cpus = policy->cpus;
475#endif
476
477 saved_mask = current->cpus_allowed;
478 first_cpu = 1;
479 cpus_clear(covered_cpus);
480 for_each_cpu_mask(j, online_policy_cpus) {
481 /*
482 * Support for SMP systems.
483 * Make sure we are running on CPU that wants to change freq
484 */
485 cpus_clear(set_mask);
486 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
487 cpus_or(set_mask, set_mask, online_policy_cpus);
488 else
489 cpu_set(j, set_mask);
490
491 set_cpus_allowed(current, set_mask);
492 preempt_disable();
493 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) {
494 dprintk("couldn't limit to CPUs in this domain\n");
495 retval = -EAGAIN;
496 if (first_cpu) {
497 /* We haven't started the transition yet. */
498 goto migrate_end;
499 }
500 preempt_enable();
501 break;
502 }
503
504 msr = centrino_model[cpu]->op_points[newstate].index;
505
506 if (first_cpu) {
507 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
508 if (msr == (oldmsr & 0xffff)) {
509 dprintk("no change needed - msr was and needs "
510 "to be %x\n", oldmsr);
511 retval = 0;
512 goto migrate_end;
513 }
514
515 freqs.old = extract_clock(oldmsr, cpu, 0);
516 freqs.new = extract_clock(msr, cpu, 0);
517
518 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
519 target_freq, freqs.old, freqs.new, msr);
520
521 for_each_cpu_mask(k, online_policy_cpus) {
522 freqs.cpu = k;
523 cpufreq_notify_transition(&freqs,
524 CPUFREQ_PRECHANGE);
525 }
526
527 first_cpu = 0;
528 /* all but 16 LSB are reserved, treat them with care */
529 oldmsr &= ~0xffff;
530 msr &= 0xffff;
531 oldmsr |= msr;
532 }
533
534 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
535 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
536 preempt_enable();
537 break;
538 }
539
540 cpu_set(j, covered_cpus);
541 preempt_enable();
542 }
543
544 for_each_cpu_mask(k, online_policy_cpus) {
545 freqs.cpu = k;
546 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
547 }
548
549 if (unlikely(retval)) {
550 /*
551 * We have failed halfway through the frequency change.
552 * We have sent callbacks to policy->cpus and
553		 * MSRs have already been written on covered_cpus.
554 * Best effort undo..
555 */
556
557 if (!cpus_empty(covered_cpus)) {
558 for_each_cpu_mask(j, covered_cpus) {
559 set_cpus_allowed(current, cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 }
562 }
563
564 tmp = freqs.new;
565 freqs.new = freqs.old;
566 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) {
568 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 }
572 }
573 set_cpus_allowed(current, saved_mask);
574 return 0;
575
576migrate_end:
577 preempt_enable();
578 set_cpus_allowed(current, saved_mask);
579 return 0;
580}
581
582static struct freq_attr* centrino_attr[] = {
583 &cpufreq_freq_attr_scaling_available_freqs,
584 NULL,
585};
586
587static struct cpufreq_driver centrino_driver = {
588 .name = "centrino", /* should be speedstep-centrino,
589 but there's a 16 char limit */
590 .init = centrino_cpu_init,
591 .exit = centrino_cpu_exit,
592 .verify = centrino_verify,
593 .target = centrino_target,
594 .get = get_cur_freq,
595 .attr = centrino_attr,
596 .owner = THIS_MODULE,
597};
598
599
600/**
601 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
602 *
603 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
604 * unsupported devices, -ENOENT if there's no voltage table for this
605 * particular CPU model, -EINVAL on problems during initialization,
606 * and zero on success.
607 *
608 * This is quite picky. Not only does the CPU have to advertise the
609 * "est" flag in the cpuid capability flags, we look for a specific
610 * CPU model and stepping, and we need to have the exact model name in
611 * our voltage tables. That is, be paranoid about not releasing
612 * someone's valuable magic smoke.
613 */
614static int __init centrino_init(void)
615{
616 struct cpuinfo_x86 *cpu = cpu_data;
617
618 if (!cpu_has(cpu, X86_FEATURE_EST))
619 return -ENODEV;
620
621 return cpufreq_register_driver(&centrino_driver);
622}
623
624static void __exit centrino_exit(void)
625{
626 cpufreq_unregister_driver(&centrino_driver);
627}
628
629MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
630MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
631MODULE_LICENSE ("GPL");
632
633late_initcall(centrino_init);
634module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
new file mode 100644
index 000000000000..a5b2346faf1f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -0,0 +1,440 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h>
28
29#include "speedstep-lib.h"
30
31
32/* speedstep_chipset:
33 * It is necessary to know which chipset is used. As accesses to
34 * this device occur at various places in this module, we need a
35 * static struct pci_dev * pointing to that device.
36 */
37static struct pci_dev *speedstep_chipset_dev;
38
39
40/* speedstep_processor
41 */
42static unsigned int speedstep_processor = 0;
43
44static u32 pmbase;
45
46/*
47 * There are only two frequency states for each processor. Values
48 * are in kHz for the time being.
49 */
50static struct cpufreq_frequency_table speedstep_freqs[] = {
51 {SPEEDSTEP_HIGH, 0},
52 {SPEEDSTEP_LOW, 0},
53 {0, CPUFREQ_TABLE_END},
54};
55
56
57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register (void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state.
92 */
93static void speedstep_set_state (unsigned int state)
94{
95 u8 pm2_blk;
96 u8 value;
97 unsigned long flags;
98
99 if (state > 0x1)
100 return;
101
102 /* Disable IRQs */
103 local_irq_save(flags);
104
105 /* read state */
106 value = inb(pmbase + 0x50);
107
108 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
109
110 /* write new state */
111 value &= 0xFE;
112 value |= state;
113
114 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
115
116 /* Disable bus master arbitration */
117 pm2_blk = inb(pmbase + 0x20);
118 pm2_blk |= 0x01;
119 outb(pm2_blk, (pmbase + 0x20));
120
121 /* Actual transition */
122 outb(value, (pmbase + 0x50));
123
124 /* Restore bus master arbitration */
125 pm2_blk &= 0xfe;
126 outb(pm2_blk, (pmbase + 0x20));
127
128 /* check if transition was successful */
129 value = inb(pmbase + 0x50);
130
131 /* Enable IRQs */
132 local_irq_restore(flags);
133
134 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
135
136 if (state == (value & 0x1)) {
137 dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000));
138 } else {
139 printk (KERN_ERR "cpufreq: change failed - I/O error\n");
140 }
141
142 return;
143}
144
145
146/**
147 * speedstep_activate - activate SpeedStep control in the chipset
148 *
149 * Tries to activate the SpeedStep status and control registers.
150 * Returns -EINVAL on an unsupported chipset, and zero on success.
151 */
152static int speedstep_activate (void)
153{
154 u16 value = 0;
155
156 if (!speedstep_chipset_dev)
157 return -EINVAL;
158
159 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
160 if (!(value & 0x08)) {
161 value |= 0x08;
162 dprintk("activating SpeedStep (TM) registers\n");
163 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
164 }
165
166 return 0;
167}
168
169
170/**
171 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
172 *
173 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
174 * the LPC bridge / PM module which contains all power-management
175 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
176 * chipset, or zero on failure.
177 */
178static unsigned int speedstep_detect_chipset (void)
179{
180 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
181 PCI_DEVICE_ID_INTEL_82801DB_12,
182 PCI_ANY_ID,
183 PCI_ANY_ID,
184 NULL);
185 if (speedstep_chipset_dev)
186 return 4; /* 4-M */
187
188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
189 PCI_DEVICE_ID_INTEL_82801CA_12,
190 PCI_ANY_ID,
191 PCI_ANY_ID,
192 NULL);
193 if (speedstep_chipset_dev)
194 return 3; /* 3-M */
195
196
197 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
198 PCI_DEVICE_ID_INTEL_82801BA_10,
199 PCI_ANY_ID,
200 PCI_ANY_ID,
201 NULL);
202 if (speedstep_chipset_dev) {
203 /* speedstep.c causes lockups on Dell Inspirons 8000 and
204 * 8100 which use a pretty old revision of the 82815
205		 * host bridge. Abort on these systems.
206 */
207 static struct pci_dev *hostbridge;
208
209 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
210 PCI_DEVICE_ID_INTEL_82815_MC,
211 PCI_ANY_ID,
212 PCI_ANY_ID,
213 NULL);
214
215 if (!hostbridge)
216 return 2; /* 2-M */
217
218 if (hostbridge->revision < 5) {
219 dprintk("hostbridge does not support speedstep\n");
220 speedstep_chipset_dev = NULL;
221 pci_dev_put(hostbridge);
222 return 0;
223 }
224
225 pci_dev_put(hostbridge);
226 return 2; /* 2-M */
227 }
228
229 return 0;
230}
231
232static unsigned int _speedstep_get(cpumask_t cpus)
233{
234 unsigned int speed;
235 cpumask_t cpus_allowed;
236
237 cpus_allowed = current->cpus_allowed;
238 set_cpus_allowed(current, cpus);
239 speed = speedstep_get_processor_frequency(speedstep_processor);
240 set_cpus_allowed(current, cpus_allowed);
241 dprintk("detected %u kHz as current frequency\n", speed);
242 return speed;
243}
244
245static unsigned int speedstep_get(unsigned int cpu)
246{
247 return _speedstep_get(cpumask_of_cpu(cpu));
248}
249
250/**
251 * speedstep_target - set a new CPUFreq policy
252 * @policy: new policy
253 * @target_freq: the target frequency
254 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
255 *
256 * Sets a new CPUFreq policy.
257 */
258static int speedstep_target (struct cpufreq_policy *policy,
259 unsigned int target_freq,
260 unsigned int relation)
261{
262 unsigned int newstate = 0;
263 struct cpufreq_freqs freqs;
264 cpumask_t cpus_allowed;
265 int i;
266
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
268 return -EINVAL;
269
270 freqs.old = _speedstep_get(policy->cpus);
271 freqs.new = speedstep_freqs[newstate].frequency;
272 freqs.cpu = policy->cpu;
273
274 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
275
276 /* no transition necessary */
277 if (freqs.old == freqs.new)
278 return 0;
279
280 cpus_allowed = current->cpus_allowed;
281
282 for_each_cpu_mask(i, policy->cpus) {
283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 }
286
287 /* switch to physical CPU where state is to be changed */
288 set_cpus_allowed(current, policy->cpus);
289
290 speedstep_set_state(newstate);
291
292 /* allow to be run on all CPUs */
293 set_cpus_allowed(current, cpus_allowed);
294
295 for_each_cpu_mask(i, policy->cpus) {
296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 }
299
300 return 0;
301}
302
303
304/**
305 * speedstep_verify - verifies a new CPUFreq policy
306 * @policy: new policy
307 *
308 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
309 * at least one border included.
310 */
311static int speedstep_verify (struct cpufreq_policy *policy)
312{
313 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
314}
315
316
317static int speedstep_cpu_init(struct cpufreq_policy *policy)
318{
319 int result = 0;
320 unsigned int speed;
321 cpumask_t cpus_allowed;
322
323 /* only run on CPU to be set, or on its sibling */
324#ifdef CONFIG_SMP
325 policy->cpus = cpu_sibling_map[policy->cpu];
326#endif
327
328 cpus_allowed = current->cpus_allowed;
329 set_cpus_allowed(current, policy->cpus);
330
331 /* detect low and high frequency and transition latency */
332 result = speedstep_get_freqs(speedstep_processor,
333 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
334 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
335 &policy->cpuinfo.transition_latency,
336 &speedstep_set_state);
337 set_cpus_allowed(current, cpus_allowed);
338 if (result)
339 return result;
340
341 /* get current speed setting */
342 speed = _speedstep_get(policy->cpus);
343 if (!speed)
344 return -EIO;
345
346 dprintk("currently at %s speed setting - %i MHz\n",
347 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
348 (speed / 1000));
349
350 /* cpuinfo and default policy values */
351 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
352 policy->cur = speed;
353
354 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
355 if (result)
356 return (result);
357
358 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
359
360 return 0;
361}
362
363
364static int speedstep_cpu_exit(struct cpufreq_policy *policy)
365{
366 cpufreq_frequency_table_put_attr(policy->cpu);
367 return 0;
368}
369
370static struct freq_attr* speedstep_attr[] = {
371 &cpufreq_freq_attr_scaling_available_freqs,
372 NULL,
373};
374
375
376static struct cpufreq_driver speedstep_driver = {
377 .name = "speedstep-ich",
378 .verify = speedstep_verify,
379 .target = speedstep_target,
380 .init = speedstep_cpu_init,
381 .exit = speedstep_cpu_exit,
382 .get = speedstep_get,
383 .owner = THIS_MODULE,
384 .attr = speedstep_attr,
385};
386
387
388/**
389 * speedstep_init - initializes the SpeedStep CPUFreq driver
390 *
391 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
392 * devices, -EINVAL on problems during initialization, and zero on
393 * success.
394 */
395static int __init speedstep_init(void)
396{
397 /* detect processor */
398 speedstep_processor = speedstep_detect_processor();
399 if (!speedstep_processor) {
400 dprintk("Intel(R) SpeedStep(TM) capable processor not found\n");
401 return -ENODEV;
402 }
403
404 /* detect chipset */
405 if (!speedstep_detect_chipset()) {
406 dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n");
407 return -ENODEV;
408 }
409
410 /* activate speedstep support */
411 if (speedstep_activate()) {
412 pci_dev_put(speedstep_chipset_dev);
413 return -EINVAL;
414 }
415
416 if (speedstep_find_register())
417 return -ENODEV;
418
419 return cpufreq_register_driver(&speedstep_driver);
420}
421
422
423/**
424 * speedstep_exit - unregisters SpeedStep support
425 *
426 * Unregisters SpeedStep support.
427 */
428static void __exit speedstep_exit(void)
429{
430 pci_dev_put(speedstep_chipset_dev);
431 cpufreq_unregister_driver(&speedstep_driver);
432}
433
434
435MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
436MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
437MODULE_LICENSE ("GPL");
438
439module_init(speedstep_init);
440module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
new file mode 100644
index 000000000000..b1acc8ce3167
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -0,0 +1,444 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17
18#include <asm/msr.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg)
22
23#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
24static int relaxed_check = 0;
25#else
26#define relaxed_check 0
27#endif
28
29/*********************************************************************
30 * GET PROCESSOR CORE SPEED IN KHZ *
31 *********************************************************************/
32
33static unsigned int pentium3_get_frequency (unsigned int processor)
34{
35 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
36 struct {
37 unsigned int ratio; /* Frequency Multiplier (x10) */
38 u8 bitmap; /* power on configuration bits
39 [27, 25:22] (in MSR 0x2a) */
40 } msr_decode_mult [] = {
41 { 30, 0x01 },
42 { 35, 0x05 },
43 { 40, 0x02 },
44 { 45, 0x06 },
45 { 50, 0x00 },
46 { 55, 0x04 },
47 { 60, 0x0b },
48 { 65, 0x0f },
49 { 70, 0x09 },
50 { 75, 0x0d },
51 { 80, 0x0a },
52 { 85, 0x26 },
53 { 90, 0x20 },
54 { 100, 0x2b },
55 { 0, 0xff } /* error or unknown value */
56 };
57
58 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
59 struct {
60 unsigned int value; /* Front Side Bus speed in MHz */
61 u8 bitmap; /* power on configuration bits [18: 19]
62 (in MSR 0x2a) */
63 } msr_decode_fsb [] = {
64 { 66, 0x0 },
65 { 100, 0x2 },
66 { 133, 0x1 },
67 { 0, 0xff}
68 };
69
70 u32 msr_lo, msr_tmp;
71 int i = 0, j = 0;
72
73 /* read MSR 0x2a - we only need the low 32 bits */
74 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
75 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
76 msr_tmp = msr_lo;
77
78 /* decode the FSB */
79 msr_tmp &= 0x00c0000;
80 msr_tmp >>= 18;
81 while (msr_tmp != msr_decode_fsb[i].bitmap) {
82 if (msr_decode_fsb[i].bitmap == 0xff)
83 return 0;
84 i++;
85 }
86
87 /* decode the multiplier */
88 if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) {
89 dprintk("workaround for early PIIIs\n");
90 msr_lo &= 0x03c00000;
91 } else
92 msr_lo &= 0x0bc00000;
93 msr_lo >>= 22;
94 while (msr_lo != msr_decode_mult[j].bitmap) {
95 if (msr_decode_mult[j].bitmap == 0xff)
96 return 0;
97 j++;
98 }
99
100 dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
101
102 return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100);
103}
104
105
106static unsigned int pentiumM_get_frequency(void)
107{
108 u32 msr_lo, msr_tmp;
109
110 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
111 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
112
113 /* see table B-2 of 24547212.pdf */
114 if (msr_lo & 0x00040000) {
115 printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp);
116 return 0;
117 }
118
119 msr_tmp = (msr_lo >> 22) & 0x1f;
120 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000));
121
122 return (msr_tmp * 100 * 1000);
123}
124
125static unsigned int pentium_core_get_frequency(void)
126{
127 u32 fsb = 0;
128 u32 msr_lo, msr_tmp;
129
130 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
131 /* see table B-2 of 25366920.pdf */
132 switch (msr_lo & 0x07) {
133 case 5:
134 fsb = 100000;
135 break;
136 case 1:
137 fsb = 133333;
138 break;
139 case 3:
140 fsb = 166667;
141 break;
142 default:
143		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
144 }
145
146 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
147 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
148
149 msr_tmp = (msr_lo >> 22) & 0x1f;
150 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb));
151
152 return (msr_tmp * fsb);
153}
154
155
156static unsigned int pentium4_get_frequency(void)
157{
158 struct cpuinfo_x86 *c = &boot_cpu_data;
159 u32 msr_lo, msr_hi, mult;
160 unsigned int fsb = 0;
161
162 rdmsr(0x2c, msr_lo, msr_hi);
163
164 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
165
166 /* decode the FSB: see IA-32 Intel (C) Architecture Software
167	 * Developer's Manual, Volume 3: System Programming Guide,
168 * revision #12 in Table B-1: MSRs in the Pentium 4 and
169 * Intel Xeon Processors, on page B-4 and B-5.
170 */
171 if (c->x86_model < 2)
172 fsb = 100 * 1000;
173 else {
174 u8 fsb_code = (msr_lo >> 16) & 0x7;
175 switch (fsb_code) {
176 case 0:
177 fsb = 100 * 1000;
178 break;
179 case 1:
180 fsb = 13333 * 10;
181 break;
182 case 2:
183 fsb = 200 * 1000;
184 break;
185 }
186 }
187
188 if (!fsb)
189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
190
191 /* Multiplier. */
192 if (c->x86_model < 2)
193 mult = msr_lo >> 27;
194 else
195 mult = msr_lo >> 24;
196
197 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
198
199 return (fsb * mult);
200}
201
202
203unsigned int speedstep_get_processor_frequency(unsigned int processor)
204{
205 switch (processor) {
206 case SPEEDSTEP_PROCESSOR_PCORE:
207 return pentium_core_get_frequency();
208 case SPEEDSTEP_PROCESSOR_PM:
209 return pentiumM_get_frequency();
210 case SPEEDSTEP_PROCESSOR_P4D:
211 case SPEEDSTEP_PROCESSOR_P4M:
212 return pentium4_get_frequency();
213 case SPEEDSTEP_PROCESSOR_PIII_T:
214 case SPEEDSTEP_PROCESSOR_PIII_C:
215 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
216 return pentium3_get_frequency(processor);
217 default:
218 return 0;
219 };
220 return 0;
221}
222EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency);
223
224
225/*********************************************************************
226 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
227 *********************************************************************/
228
229unsigned int speedstep_detect_processor (void)
230{
231 struct cpuinfo_x86 *c = cpu_data;
232 u32 ebx, msr_lo, msr_hi;
233
234 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
235
236 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
237 ((c->x86 != 6) && (c->x86 != 0xF)))
238 return 0;
239
240 if (c->x86 == 0xF) {
241 /* Intel Mobile Pentium 4-M
242 * or Intel Mobile Pentium 4 with 533 MHz FSB */
243 if (c->x86_model != 2)
244 return 0;
245
246 ebx = cpuid_ebx(0x00000001);
247 ebx &= 0x000000FF;
248
249 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
250
251 switch (c->x86_mask) {
252 case 4:
253 /*
254 * B-stepping [M-P4-M]
255 * sample has ebx = 0x0f, production has 0x0e.
256 */
257 if ((ebx == 0x0e) || (ebx == 0x0f))
258 return SPEEDSTEP_PROCESSOR_P4M;
259 break;
260 case 7:
261 /*
262 * C-stepping [M-P4-M]
263 * needs to have ebx=0x0e, else it's a celeron:
264 * cf. 25130917.pdf / page 7, footnote 5 even
265 * though 25072120.pdf / page 7 doesn't say
266 * samples are only of B-stepping...
267 */
268 if (ebx == 0x0e)
269 return SPEEDSTEP_PROCESSOR_P4M;
270 break;
271 case 9:
272 /*
273 * D-stepping [M-P4-M or M-P4/533]
274 *
275 * this is totally strange: CPUID 0x0F29 is
276 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
277 * The latter need to be sorted out as they don't
278 * support speedstep.
279 * Celerons with CPUID 0x0F29 may have either
280 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
281 * specific.
282 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
283 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
284 * also, M-P4M HTs have ebx=0x8, too
285 * For now, they are distinguished by the model_id string
286 */
287 if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL))
288 return SPEEDSTEP_PROCESSOR_P4M;
289 break;
290 default:
291 break;
292 }
293 return 0;
294 }
295
296 switch (c->x86_model) {
297 case 0x0B: /* Intel PIII [Tualatin] */
298 /* cpuid_ebx(1) is 0x04 for desktop PIII, 0x06 for mobile PIII-M */
299 ebx = cpuid_ebx(0x00000001);
300 dprintk("ebx is %x\n", ebx);
301
302 ebx &= 0x000000FF;
303
304 if (ebx != 0x06)
305 return 0;
306
307 /* So far all PIII-M processors support SpeedStep. See
308 * Intel's 24540640.pdf of June 2003
309 */
310 return SPEEDSTEP_PROCESSOR_PIII_T;
311
312 case 0x08: /* Intel PIII [Coppermine] */
313
314 /* all mobile PIII Coppermines have FSB 100 MHz
315 * ==> sort out a few desktop PIIIs. */
316 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
317 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi);
318 msr_lo &= 0x00c0000;
319 if (msr_lo != 0x0080000)
320 return 0;
321
322 /*
323 * If the processor is a mobile version,
324	 * platform ID has bit 50 set;
325 * it has SpeedStep technology if either
326 * bit 56 or 57 is set
327 */
328 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
329	dprintk("Coppermine: MSR_IA32_PLATFORM_ID is 0x%x, 0x%x\n", msr_lo, msr_hi);
330 if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
331 if (c->x86_mask == 0x01) {
332 dprintk("early PIII version\n");
333 return SPEEDSTEP_PROCESSOR_PIII_C_EARLY;
334 } else
335 return SPEEDSTEP_PROCESSOR_PIII_C;
336 }
337
338 default:
339 return 0;
340 }
341}
342EXPORT_SYMBOL_GPL(speedstep_detect_processor);
343
344
345/*********************************************************************
346 * DETECT SPEEDSTEP SPEEDS *
347 *********************************************************************/
348
349unsigned int speedstep_get_freqs(unsigned int processor,
350 unsigned int *low_speed,
351 unsigned int *high_speed,
352 unsigned int *transition_latency,
353 void (*set_state) (unsigned int state))
354{
355 unsigned int prev_speed;
356 unsigned int ret = 0;
357 unsigned long flags;
358 struct timeval tv1, tv2;
359
360 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
361 return -EINVAL;
362
363 dprintk("trying to determine both speeds\n");
364
365 /* get current speed */
366 prev_speed = speedstep_get_processor_frequency(processor);
367 if (!prev_speed)
368 return -EIO;
369
370 dprintk("previous speed is %u\n", prev_speed);
371
372 local_irq_save(flags);
373
374 /* switch to low state */
375 set_state(SPEEDSTEP_LOW);
376 *low_speed = speedstep_get_processor_frequency(processor);
377 if (!*low_speed) {
378 ret = -EIO;
379 goto out;
380 }
381
382 dprintk("low speed is %u\n", *low_speed);
383
384 /* start latency measurement */
385 if (transition_latency)
386 do_gettimeofday(&tv1);
387
388 /* switch to high state */
389 set_state(SPEEDSTEP_HIGH);
390
391 /* end latency measurement */
392 if (transition_latency)
393 do_gettimeofday(&tv2);
394
395 *high_speed = speedstep_get_processor_frequency(processor);
396 if (!*high_speed) {
397 ret = -EIO;
398 goto out;
399 }
400
401 dprintk("high speed is %u\n", *high_speed);
402
403 if (*low_speed == *high_speed) {
404 ret = -ENODEV;
405 goto out;
406 }
407
408 /* switch to previous state, if necessary */
409 if (*high_speed != prev_speed)
410 set_state(SPEEDSTEP_LOW);
411
412 if (transition_latency) {
413 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
414 tv2.tv_usec - tv1.tv_usec;
415 dprintk("transition latency is %u uSec\n", *transition_latency);
416
417 /* convert uSec to nSec and add 20% for safety reasons */
418 *transition_latency *= 1200;
419
420 /* check if the latency measurement is too high or too low
421 * and set it to a safe value (500uSec) in that case
422 */
423 if (*transition_latency > 10000000 || *transition_latency < 50000) {
424 			printk (KERN_WARNING "speedstep: measured frequency transition latency seems out of "
425 				"range (%u nSec), falling back to a safe value of %u nSec.\n",
426 *transition_latency, 500000);
427 *transition_latency = 500000;
428 }
429 }
430
431out:
432 local_irq_restore(flags);
433 return (ret);
434}
435EXPORT_SYMBOL_GPL(speedstep_get_freqs);
436
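/*
 * Worked example for the latency handling above (illustrative values):
 * if the two do_gettimeofday() samples differ by 500 usec, the raw
 * latency is 500; "*transition_latency *= 1200" turns that into
 * 600000 nsec (usec -> nsec plus a 20% safety margin), which lies
 * inside the accepted [50000, 10000000] nsec window and is kept as-is.
 */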
437#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
438module_param(relaxed_check, int, 0444);
439MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability.");
440#endif
441
442MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>");
443MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
444MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
new file mode 100644
index 000000000000..b11bcc608cac
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -0,0 +1,49 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14
15#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */
16#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */
17#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */
18#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_processor_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */
25#define SPEEDSTEP_PROCESSOR_PCORE 0xFFFFFF05 /* Core */
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void);
35
36/* detect the current speed (in kHz) of the processor */
37extern unsigned int speedstep_get_processor_frequency(unsigned int processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state is invoked with either SPEEDSTEP_HIGH or SPEEDSTEP_LOW
42 * as its argument; it must switch the frequency without initiating
43 * any cpufreq_notify_transition calls of its own.
44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
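/*
 * Usage sketch (illustrative only; it mirrors the way speedstep-smi.c
 * calls this helper, with a hypothetical driver callback my_set_state
 * that switches the CPU to SPEEDSTEP_LOW or SPEEDSTEP_HIGH):
 *
 *	unsigned int low, high;
 *	unsigned int ret = speedstep_get_freqs(speedstep_detect_processor(),
 *					       &low, &high, NULL,
 *					       &my_set_state);
 *	if (!ret)
 *		printk(KERN_INFO "low %u kHz, high %u kHz\n", low, high);
 */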
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
new file mode 100644
index 000000000000..e1c509aa3054
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -0,0 +1,424 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h>
22#include <asm/ist.h>
23#include <asm/io.h>
24
25#include "speedstep-lib.h"
26
27/* speedstep system management interface port/command.
28 *
29 * These parameters are obtained from the IST-SMI BIOS call.
30 * If the user supplies them as module parameters, those values
31 * are used instead.
32 */
33static int smi_port = 0;
34static int smi_cmd = 0;
35static unsigned int smi_sig = 0;
36
37/* info about the processor */
38static unsigned int speedstep_processor = 0;
39
40/*
41 * There are only two frequency states for each processor. Values
42 * are in kHz for the time being.
43 */
44static struct cpufreq_frequency_table speedstep_freqs[] = {
45 {SPEEDSTEP_HIGH, 0},
46 {SPEEDSTEP_LOW, 0},
47 {0, CPUFREQ_TABLE_END},
48};
49
50#define GET_SPEEDSTEP_OWNER 0
51#define GET_SPEEDSTEP_STATE 1
52#define SET_SPEEDSTEP_STATE 2
53#define GET_SPEEDSTEP_FREQS 4
54
55/* how often shall the SMI call be tried if it failed, e.g. because
56 * of DMA activity going on? */
57#define SMI_TRIES 5
58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership (void)
65{
66 u32 command, result, magic;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
74
75 __asm__ __volatile__(
76 "out %%al, (%%dx)\n"
77 : "=D" (result)
78 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
79 "D" (0), "S" (magic)
80 : "memory"
81 );
82
83 dprintk("result is %x\n", result);
84
85 return result;
86}
87
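/*
 * Example of the command encoding above (hypothetical values): with
 * smi_sig = 0x47534943 and smi_cmd = 0x82, command becomes
 * (0x47534943 & 0xffffff00) | (0x82 & 0xff) = 0x47534982, i.e. the
 * BIOS signature with its low byte replaced by the SMI command byte.
 */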
88/**
89 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
90 * @low: the low frequency value is placed here
91 * @high: the high frequency value is placed here
92 *
93 * Only available on later SpeedStep-enabled systems; returns bogus results or
94 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
95 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
96 */
97static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
98{
99 u32 command, result = 0, edi, high_mhz, low_mhz;
100 u32 state=0;
101 u32 function = GET_SPEEDSTEP_FREQS;
102
103 if (!(ist_info.event & 0xFFFF)) {
104 dprintk("bug #1422 -- can't read freqs from BIOS\n");
105 return -ENODEV;
106 }
107
108 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
109
110 dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
111
112 __asm__ __volatile__("movl $0, %%edi\n"
113 "out %%al, (%%dx)\n"
114 : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
115 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
116 );
117
118 dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
119
120 /* abort if results are obviously incorrect... */
121 if ((high_mhz + low_mhz) < 600)
122 return -EINVAL;
123
124 *high = high_mhz * 1000;
125 *low = low_mhz * 1000;
126
127 return result;
128}
129
130/**
131 * speedstep_get_state - read the current SpeedStep state
132 *
133 * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
134 */
135static int speedstep_get_state (void)
136{
137 u32 function=GET_SPEEDSTEP_STATE;
138 u32 result, state, edi, command;
139
140 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
141
142 dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
143
144 __asm__ __volatile__("movl $0, %%edi\n"
145 "out %%al, (%%dx)\n"
146 : "=a" (result), "=b" (state), "=D" (edi)
147 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
148 );
149
150 dprintk("state is %x, result is %x\n", state, result);
151
152 return (state & 1);
153}
154
155
156/**
157 * speedstep_set_state - set the SpeedStep state
158 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
159 *
160 */
161static void speedstep_set_state (unsigned int state)
162{
163 unsigned int result = 0, command, new_state;
164 unsigned long flags;
165 unsigned int function=SET_SPEEDSTEP_STATE;
166 unsigned int retry = 0;
167
168 if (state > 0x1)
169 return;
170
171 /* Disable IRQs */
172 local_irq_save(flags);
173
174 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
175
176 dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port);
177
178 do {
179 if (retry) {
180 dprintk("retry %u, previous result %u, waiting...\n", retry, result);
181 mdelay(retry * 50);
182 }
183 retry++;
184 __asm__ __volatile__(
185 "movl $0, %%edi\n"
186 "out %%al, (%%dx)\n"
187 : "=b" (new_state), "=D" (result)
188 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
189 );
190 } while ((new_state != state) && (retry <= SMI_TRIES));
191
192 /* enable IRQs */
193 local_irq_restore(flags);
194
195 if (new_state == state) {
196 dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
197 } else {
198 printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
199 }
200
201 return;
202}
203
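/*
 * Timing note for the retry loop above (derived from the code itself,
 * not from any spec): with SMI_TRIES == 5 the SMI call is attempted at
 * most six times, with back-off delays of 50 + 100 + 150 + 200 + 250 ms
 * between attempts, i.e. up to roughly 750 ms of busy-waiting in the
 * worst case, all with interrupts disabled.
 */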
204
205/**
206 * speedstep_target - set a new CPUFreq policy
207 * @policy: new policy
208 * @target_freq: new freq
209 * @relation:
210 *
211 * Sets a new CPUFreq policy/freq.
212 */
213static int speedstep_target (struct cpufreq_policy *policy,
214 unsigned int target_freq, unsigned int relation)
215{
216 unsigned int newstate = 0;
217 struct cpufreq_freqs freqs;
218
219 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
220 return -EINVAL;
221
222 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
223 freqs.new = speedstep_freqs[newstate].frequency;
224 freqs.cpu = 0; /* speedstep.c is UP only driver */
225
226 if (freqs.old == freqs.new)
227 return 0;
228
229 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
230 speedstep_set_state(newstate);
231 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
232
233 return 0;
234}
235
236
237/**
238 * speedstep_verify - verifies a new CPUFreq policy
239 * @policy: new policy
240 *
241 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
242 * at least one border included.
243 */
244static int speedstep_verify (struct cpufreq_policy *policy)
245{
246 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
247}
248
249
250static int speedstep_cpu_init(struct cpufreq_policy *policy)
251{
252 int result;
253 unsigned int speed,state;
254
255 /* capability check */
256 if (policy->cpu != 0)
257 return -ENODEV;
258
259 result = speedstep_smi_ownership();
260 if (result) {
261 		dprintk("failed to acquire ownership of the SMI interface.\n");
262 return -EINVAL;
263 }
264
265 /* detect low and high frequency */
266 result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency,
267 &speedstep_freqs[SPEEDSTEP_HIGH].frequency);
268 if (result) {
269 		/* fall back to the speedstep-lib.c detection mechanism: try both states out */
270 dprintk("could not detect low and high frequencies by SMI call.\n");
271 result = speedstep_get_freqs(speedstep_processor,
272 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
273 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
274 NULL,
275 &speedstep_set_state);
276
277 if (result) {
278 dprintk("could not detect two different speeds -- aborting.\n");
279 return result;
280 } else
281 dprintk("workaround worked.\n");
282 }
283
284 /* get current speed setting */
285 state = speedstep_get_state();
286 speed = speedstep_freqs[state].frequency;
287
288 dprintk("currently at %s speed setting - %i MHz\n",
289 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high",
290 (speed / 1000));
291
292 /* cpuinfo and default policy values */
293 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
294 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
295 policy->cur = speed;
296
297 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
298 if (result)
299 return (result);
300
301 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
302
303 return 0;
304}
305
306static int speedstep_cpu_exit(struct cpufreq_policy *policy)
307{
308 cpufreq_frequency_table_put_attr(policy->cpu);
309 return 0;
310}
311
312static unsigned int speedstep_get(unsigned int cpu)
313{
314 if (cpu)
315 return -ENODEV;
316 return speedstep_get_processor_frequency(speedstep_processor);
317}
318
319
320static int speedstep_resume(struct cpufreq_policy *policy)
321{
322 int result = speedstep_smi_ownership();
323
324 if (result)
325 		dprintk("failed to re-acquire ownership of the SMI interface.\n");
326
327 return result;
328}
329
330static struct freq_attr* speedstep_attr[] = {
331 &cpufreq_freq_attr_scaling_available_freqs,
332 NULL,
333};
334
335static struct cpufreq_driver speedstep_driver = {
336 .name = "speedstep-smi",
337 .verify = speedstep_verify,
338 .target = speedstep_target,
339 .init = speedstep_cpu_init,
340 .exit = speedstep_cpu_exit,
341 .get = speedstep_get,
342 .resume = speedstep_resume,
343 .owner = THIS_MODULE,
344 .attr = speedstep_attr,
345};
346
347/**
348 * speedstep_init - initializes the SpeedStep CPUFreq driver
349 *
350 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
351 * BIOS, -EINVAL on problems during initialization, and zero on
352 * success.
353 */
354static int __init speedstep_init(void)
355{
356 speedstep_processor = speedstep_detect_processor();
357
358 switch (speedstep_processor) {
359 case SPEEDSTEP_PROCESSOR_PIII_T:
360 case SPEEDSTEP_PROCESSOR_PIII_C:
361 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
362 break;
363 default:
364 speedstep_processor = 0;
365 }
366
367 if (!speedstep_processor) {
368 dprintk ("No supported Intel CPU detected.\n");
369 return -ENODEV;
370 }
371
372 dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n",
373 ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level);
374
375 	/* Error out if there is neither an IST-SMI BIOS nor user-supplied
376 	   port/command parameters; sig = 'ISGE', aka 'Intel Speedstep Gate E' */
377 if ((ist_info.signature != 0x47534943) && (
378 (smi_port == 0) || (smi_cmd == 0)))
379 return -ENODEV;
380
381 if (smi_sig == 1)
382 smi_sig = 0x47534943;
383 else
384 smi_sig = ist_info.signature;
385
386 	/* set up smi_port from the module parameter or from the BIOS */
387 if ((smi_port > 0xff) || (smi_port < 0))
388 return -EINVAL;
389 else if (smi_port == 0)
390 smi_port = ist_info.command & 0xff;
391
392 if ((smi_cmd > 0xff) || (smi_cmd < 0))
393 return -EINVAL;
394 else if (smi_cmd == 0)
395 smi_cmd = (ist_info.command >> 16) & 0xff;
396
397 return cpufreq_register_driver(&speedstep_driver);
398}
399
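/*
 * Worked example of the ist_info.command decoding above (hypothetical
 * BIOS value): with ist_info.command == 0x008200b2 and no module
 * parameters given, smi_port = 0x008200b2 & 0xff = 0xb2 and
 * smi_cmd = (0x008200b2 >> 16) & 0xff = 0x82, matching the Intel
 * defaults mentioned in the module parameter descriptions below.
 */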
400
401/**
402 * speedstep_exit - unregisters SpeedStep support
403 *
404 * Unregisters SpeedStep support.
405 */
406static void __exit speedstep_exit(void)
407{
408 cpufreq_unregister_driver(&speedstep_driver);
409}
410
411module_param(smi_port, int, 0444);
412module_param(smi_cmd, int, 0444);
413module_param(smi_sig, uint, 0444);
414
415MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2");
416MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82");
417MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface.");
418
419MODULE_AUTHOR ("Hiroshi Miura");
420MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface.");
421MODULE_LICENSE ("GPL");
422
423module_init(speedstep_init);
424module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
new file mode 100644
index 000000000000..122d2d75aa9f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -0,0 +1,463 @@
1#include <linux/init.h>
2#include <linux/bitops.h>
3#include <linux/delay.h>
4#include <linux/pci.h>
5#include <asm/dma.h>
6#include <asm/io.h>
7#include <asm/processor-cyrix.h>
8#include <asm/timer.h>
9#include <asm/pci-direct.h>
10#include <asm/tsc.h>
11
12#include "cpu.h"
13
14/*
15 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info about the CPU
16 */
17static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
18{
19 unsigned char ccr2, ccr3;
20 unsigned long flags;
21
22 /* we test for DEVID by checking whether CCR3 is writable */
23 local_irq_save(flags);
24 ccr3 = getCx86(CX86_CCR3);
25 setCx86(CX86_CCR3, ccr3 ^ 0x80);
26 getCx86(0xc0); /* dummy to change bus */
27
28 if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
29 ccr2 = getCx86(CX86_CCR2);
30 setCx86(CX86_CCR2, ccr2 ^ 0x04);
31 getCx86(0xc0); /* dummy */
32
33 if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
34 *dir0 = 0xfd;
35 else { /* Cx486S A step */
36 setCx86(CX86_CCR2, ccr2);
37 *dir0 = 0xfe;
38 }
39 }
40 else {
41 setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
42
43 /* read DIR0 and DIR1 CPU registers */
44 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1);
46 }
47 local_irq_restore(flags);
48}
49
50/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c
53 *
54 * Actually since bugs.h doesn't even reference this perhaps someone should
55 * fix the documentation ???
56 */
57static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
58
59static char Cx86_model[][9] __cpuinitdata = {
60 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
61 "M II ", "Unknown"
62};
63static char Cx486_name[][5] __cpuinitdata = {
64 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
65 "SRx2", "DRx2"
66};
67static char Cx486S_name[][4] __cpuinitdata = {
68 "S", "S2", "Se", "S2e"
69};
70static char Cx486D_name[][4] __cpuinitdata = {
71 "DX", "DX2", "?", "?", "?", "DX4"
72};
73static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
74static char cyrix_model_mult1[] __cpuinitdata = "12??43";
75static char cyrix_model_mult2[] __cpuinitdata = "12233445";
76
77/*
78 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
79 * BIOSes for compatibility with DOS games. This makes the udelay loop
80 * work correctly, and improves performance.
81 *
82 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
83 */
84
85extern void calibrate_delay(void) __init;
86
87static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
88{
89 unsigned long flags;
90
91 if (Cx86_dir0_msb == 3) {
92 unsigned char ccr3, ccr5;
93
94 local_irq_save(flags);
95 ccr3 = getCx86(CX86_CCR3);
96 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
97 ccr5 = getCx86(CX86_CCR5);
98 if (ccr5 & 2)
99 setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
100 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
101 local_irq_restore(flags);
102
103 if (ccr5 & 2) { /* possible wrong calibration done */
104 printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
105 calibrate_delay();
106 c->loops_per_jiffy = loops_per_jiffy;
107 }
108 }
109}
110
111
112static void __cpuinit set_cx86_reorder(void)
113{
114 u8 ccr3;
115
116 printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
117 ccr3 = getCx86(CX86_CCR3);
118 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
119
120 /* Load/Store Serialize to mem access disable (=reorder it)  */
121 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
122 /* set load/store serialize from 1GB to 4GB */
123 ccr3 |= 0xe0;
124 setCx86(CX86_CCR3, ccr3);
125}
126
127static void __cpuinit set_cx86_memwb(void)
128{
129 u32 cr0;
130
131 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
132
133 /* CCR2 bit 2: unlock NW bit */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
135 /* set 'Not Write-through' */
136 cr0 = 0x20000000;
137 write_cr0(read_cr0() | cr0);
138 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
140}
141
142static void __cpuinit set_cx86_inc(void)
143{
144 unsigned char ccr3;
145
146 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
147
148 ccr3 = getCx86(CX86_CCR3);
149 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN  */
150 /* PCR1 -- Performance Control */
151 /* Incrementor on, whatever that is */
152 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
153 /* PCR0 -- Performance Control */
154 /* Incrementor Margin 10 */
155 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
156 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
157}
158
159/*
160 * Configure later MediaGX and/or Geode processor.
161 */
162
163static void __cpuinit geode_configure(void)
164{
165 unsigned long flags;
166 u8 ccr3;
167 local_irq_save(flags);
168
169 /* Suspend on halt power saving and enable #SUSP pin */
170 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
171
172 ccr3 = getCx86(CX86_CCR3);
173 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
174
175
176 /* FPU fast, DTE cache, Mem bypass */
177 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
178 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
179
180 set_cx86_memwb();
181 set_cx86_reorder();
182 set_cx86_inc();
183
184 local_irq_restore(flags);
185}
186
187
188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
189{
190 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
191 char *buf = c->x86_model_id;
192 const char *p = NULL;
193
194 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
195 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
196 clear_bit(0*32+31, c->x86_capability);
197
198 /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
199 if ( test_bit(1*32+24, c->x86_capability) ) {
200 clear_bit(1*32+24, c->x86_capability);
201 set_bit(X86_FEATURE_CXMMX, c->x86_capability);
202 }
203
204 do_cyrix_devid(&dir0, &dir1);
205
206 check_cx686_slop(c);
207
208 Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
209 dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
210
211 /* common case step number/rev -- exceptions handled below */
212 c->x86_model = (dir1 >> 4) + 1;
213 c->x86_mask = dir1 & 0xf;
214
215 /* Now cook; the original recipe is by Channing Corn, from Cyrix.
216 * We do the same thing for each generation: we work out
217 * the model, multiplier and stepping. Black magic included,
218 * to make the silicon step/rev numbers match the printed ones.
219 */
220
221 switch (dir0_msn) {
222 unsigned char tmp;
223
224 case 0: /* Cx486SLC/DLC/SRx/DRx */
225 p = Cx486_name[dir0_lsn & 7];
226 break;
227
228 case 1: /* Cx486S/DX/DX2/DX4 */
229 p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
230 : Cx486S_name[dir0_lsn & 3];
231 break;
232
233 case 2: /* 5x86 */
234 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
235 p = Cx86_cb+2;
236 break;
237
238 case 3: /* 6x86/6x86L */
239 Cx86_cb[1] = ' ';
240 Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
241 if (dir1 > 0x21) { /* 686L */
242 Cx86_cb[0] = 'L';
243 p = Cx86_cb;
244 (c->x86_model)++;
245 } else /* 686 */
246 p = Cx86_cb+1;
247 /* Emulate MTRRs using Cyrix's ARRs. */
248 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
249 /* 6x86's contain this bug */
250 c->coma_bug = 1;
251 break;
252
253 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
254#ifdef CONFIG_PCI
255 {
256 u32 vendor, device;
257 /* It isn't really a PCI quirk directly, but the cure is the
258 same. The MediaGX has deep magic SMM stuff that handles the
259 		   SB emulation. It throws away the FIFO on disable_dma() which
260 is wrong and ruins the audio.
261
262 Bug2: VSA1 has a wrap bug so that using maximum sized DMA
263 causes bad things. According to NatSemi VSA2 has another
264 bug to do with 'hlt'. I've not seen any boards using VSA2
265 and X doesn't seem to support it either so who cares 8).
266 VSA1 we work around however.
267 */
268
269 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
270 isa_dma_bridge_buggy = 2;
271
272 /* We do this before the PCI layer is running. However we
273 are safe here as we know the bridge must be a Cyrix
274 companion and must be present */
275 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
276 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
277
278 /*
279 * The 5510/5520 companion chips have a funky PIT.
280 */
281 if (vendor == PCI_VENDOR_ID_CYRIX &&
282 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
283 mark_tsc_unstable("cyrix 5510/5520 detected");
284 }
285#endif
286 		c->x86_cache_size = 16;	/* Yep, 16K integrated cache, that's it */
287
288 		/* GXm supports extended cpuid levels, a la AMD */
289 if (c->cpuid_level == 2) {
290 /* Enable cxMMX extensions (GX1 Datasheet 54) */
291 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
292
293 /*
294 * GXm : 0x30 ... 0x5f GXm datasheet 51
295 * GXlv: 0x6x GXlv datasheet 54
296 * ? : 0x7x
297 * GX1 : 0x8x GX1 datasheet 56
298 */
299 			if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
300 geode_configure();
301 get_model_name(c); /* get CPU marketing name */
302 return;
303 }
304 else { /* MediaGX */
305 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
306 p = Cx86_cb+2;
307 c->x86_model = (dir1 & 0x20) ? 1 : 2;
308 }
309 break;
310
311 case 5: /* 6x86MX/M II */
312 if (dir1 > 7)
313 {
314 dir0_msn++; /* M II */
315 /* Enable MMX extensions (App note 108) */
316 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
317 }
318 else
319 {
320 c->coma_bug = 1; /* 6x86MX, it has the bug. */
321 }
322 tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
323 Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
324 p = Cx86_cb+tmp;
325 if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
326 (c->x86_model)++;
327 /* Emulate MTRRs using Cyrix's ARRs. */
328 set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
329 break;
330
331 case 0xf: /* Cyrix 486 without DEVID registers */
332 switch (dir0_lsn) {
333 case 0xd: /* either a 486SLC or DLC w/o DEVID */
334 dir0_msn = 0;
335 p = Cx486_name[(c->hard_math) ? 1 : 0];
336 break;
337
338 case 0xe: /* a 486S A step */
339 dir0_msn = 0;
340 p = Cx486S_name[0];
341 break;
342 }
343 break;
344
345 default: /* unknown (shouldn't happen, we know everyone ;-) */
346 dir0_msn = 7;
347 break;
348 }
349 strcpy(buf, Cx86_model[dir0_msn & 7]);
350 if (p) strcat(buf, p);
351 return;
352}
353
354/*
355 * Handle National Semiconductor branded processors
356 */
357static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
358{
359 /* There may be GX1 processors in the wild that are branded
360 * NSC and not Cyrix.
361 *
362 * This function only handles the GX processor, and kicks
363 * everything else to the Cyrix init function above - that should
364 * cover any processors that might have been branded differently
365 * after NSC acquired Cyrix.
366 *
367 * If this breaks your GX1 horribly, please e-mail
368 * info-linux@ldcmail.amd.com to tell us.
369 */
370
371 	/* Handle the GX (formerly known as the GX2) */
372
373 if (c->x86 == 5 && c->x86_model == 5)
374 display_cacheinfo(c);
375 else
376 init_cyrix(c);
377}
378
379/*
380 * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
381 * by the fact that they preserve the flags across the division of 5/2.
382 * PII and PPro exhibit this behavior too, but they have cpuid available.
383 */
384
385/*
386 * Perform the Cyrix 5/2 test. A Cyrix won't change
387 * the flags, while other 486 chips will.
388 */
389static inline int test_cyrix_52div(void)
390{
391 unsigned int test;
392
393 __asm__ __volatile__(
394 "sahf\n\t" /* clear flags (%eax = 0x0005) */
395 "div %b2\n\t" /* divide 5 by 2 */
396 "lahf" /* store flags into %ah */
397 : "=a" (test)
398 : "0" (5), "q" (2)
399 : "cc");
400
401 /* AH is 0x02 on Cyrix after the divide.. */
402 return (unsigned char) (test >> 8) == 0x02;
403}
404
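/*
 * Why the check above compares against 0x02 (illustrative reasoning):
 * EAX is loaded with 5, so AH is 0 and "sahf" writes an all-clear flag
 * image; bit 1 of EFLAGS always reads back as 1, so "lahf" yields
 * AH == 0x02 exactly when the "div" left the arithmetic flags
 * untouched -- which Cyrix parts do, while other 486s modify them.
 */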
405static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
406{
407 /* Detect Cyrix with disabled CPUID */
408 if ( c->x86 == 4 && test_cyrix_52div() ) {
409 unsigned char dir0, dir1;
410
411 strcpy(c->x86_vendor_id, "CyrixInstead");
412 c->x86_vendor = X86_VENDOR_CYRIX;
413
414 /* Actually enable cpuid on the older cyrix */
415
416 /* Retrieve CPU revisions */
417
418 do_cyrix_devid(&dir0, &dir1);
419
420 dir0>>=4;
421
422 /* Check it is an affected model */
423
424 if (dir0 == 5 || dir0 == 3)
425 {
426 unsigned char ccr3;
427 unsigned long flags;
428 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
429 local_irq_save(flags);
430 ccr3 = getCx86(CX86_CCR3);
431 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
432 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
433 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
434 local_irq_restore(flags);
435 }
436 }
437}
438
439static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
440 .c_vendor = "Cyrix",
441 .c_ident = { "CyrixInstead" },
442 .c_init = init_cyrix,
443 .c_identify = cyrix_identify,
444};
445
446int __init cyrix_init_cpu(void)
447{
448 cpu_devs[X86_VENDOR_CYRIX] = &cyrix_cpu_dev;
449 return 0;
450}
451
452static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
453 .c_vendor = "NSC",
454 .c_ident = { "Geode by NSC" },
455 .c_init = init_nsc,
456};
457
458int __init nsc_init_cpu(void)
459{
460 cpu_devs[X86_VENDOR_NSC] = &nsc_cpu_dev;
461 return 0;
462}
463
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
new file mode 100644
index 000000000000..dc4e08147b1f
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel.c
@@ -0,0 +1,333 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3
4#include <linux/string.h>
5#include <linux/bitops.h>
6#include <linux/smp.h>
7#include <linux/thread_info.h>
8#include <linux/module.h>
9
10#include <asm/processor.h>
11#include <asm/msr.h>
12#include <asm/uaccess.h>
13
14#include "cpu.h"
15
16#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h>
18#include <asm/apic.h>
19#include <mach_apic.h>
20#endif
21
22extern int trap_init_f00f_bug(void);
23
24#ifdef CONFIG_X86_INTEL_USERCOPY
25/*
26 * Alignment at which movsl is preferred for bulk memory copies.
27 */
28struct movsl_mask movsl_mask __read_mostly;
29#endif
30
31void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c)
32{
33 if (c->x86_vendor != X86_VENDOR_INTEL)
34 return;
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38}
39
40/*
41 * Early probe support logic for ppro memory erratum #50
42 *
43 * This is called before we do cpu ident work
44 */
45
46int __cpuinit ppro_with_ram_bug(void)
47{
48 /* Uses data from early_cpu_detect now */
49 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
50 boot_cpu_data.x86 == 6 &&
51 boot_cpu_data.x86_model == 1 &&
52 boot_cpu_data.x86_mask < 8) {
53 printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
54 return 1;
55 }
56 return 0;
57}
58
59
60/*
61 * P4 Xeon errata 037 workaround.
62 * Hardware prefetcher may cause stale data to be loaded into the cache.
63 */
64static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
65{
66 unsigned long lo, hi;
67
68 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
69 rdmsr (MSR_IA32_MISC_ENABLE, lo, hi);
70 if ((lo & (1<<9)) == 0) {
71 printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
72 printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
73 lo |= (1<<9); /* Disable hw prefetching */
74 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
75 }
76 }
77}
78
79
80/*
81 * find out the number of processor cores on the die
82 */
83static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
84{
85 unsigned int eax, ebx, ecx, edx;
86
87 if (c->cpuid_level < 4)
88 return 1;
89
90 /* Intel has a non-standard dependency on %ecx for this CPUID level. */
91 cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
92 if (eax & 0x1f)
93 return ((eax >> 26) + 1);
94 else
95 return 1;
96}
97
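/*
 * Example for the leaf-4 core count above (illustrative register
 * value): with eax == 0x04000121 returned by cpuid_count(4, 0, ...),
 * the cache-type field (eax & 0x1f) is non-zero and bits 31:26 hold
 * 0x04000121 >> 26 = 1, so the function reports 1 + 1 = 2 cores.
 */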
98static void __cpuinit init_intel(struct cpuinfo_x86 *c)
99{
100 unsigned int l2 = 0;
101 char *p = NULL;
102
103#ifdef CONFIG_X86_F00F_BUG
104 /*
105 * All current models of Pentium and Pentium with MMX technology CPUs
106 * have the F0 0F bug, which lets nonprivileged users lock up the system.
107 	 * Note that the workaround should only be initialized once...
108 */
109 c->f00f_bug = 0;
110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0;
112
113 c->f00f_bug = 1;
114 if ( !f00f_workaround_enabled ) {
115 trap_init_f00f_bug();
116 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
117 f00f_workaround_enabled = 1;
118 }
119 }
120#endif
121
122 select_idle_routine(c);
123 l2 = init_intel_cacheinfo(c);
124 if (c->cpuid_level > 9 ) {
125 unsigned eax = cpuid_eax(10);
126 /* Check for version and the number of counters */
127 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
128 set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
129 }
130
131 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
132 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
133 clear_bit(X86_FEATURE_SEP, c->x86_capability);
134
135 /* Names for the Pentium II/Celeron processors
136 detectable only by also checking the cache size.
137 Dixon is NOT a Celeron. */
138 if (c->x86 == 6) {
139 switch (c->x86_model) {
140 case 5:
141 if (c->x86_mask == 0) {
142 if (l2 == 0)
143 p = "Celeron (Covington)";
144 else if (l2 == 256)
145 p = "Mobile Pentium II (Dixon)";
146 }
147 break;
148
149 case 6:
150 if (l2 == 128)
151 p = "Celeron (Mendocino)";
152 else if (c->x86_mask == 0 || c->x86_mask == 5)
153 p = "Celeron-A";
154 break;
155
156 case 8:
157 if (l2 == 128)
158 p = "Celeron (Coppermine)";
159 break;
160 }
161 }
162
163 if ( p )
164 strcpy(c->x86_model_id, p);
165
166 c->x86_max_cores = num_cpu_cores(c);
167
168 detect_ht(c);
169
170 /* Work around errata */
171 Intel_errata_workarounds(c);
172
173#ifdef CONFIG_X86_INTEL_USERCOPY
174 /*
175 * Set up the preferred alignment for movsl bulk memory moves
176 */
177 switch (c->x86) {
178 case 4: /* 486: untested */
179 break;
180 case 5: /* Old Pentia: untested */
181 break;
182 case 6: /* PII/PIII only like movsl with 8-byte alignment */
183 movsl_mask.mask = 7;
184 break;
185 case 15: /* P4 is OK down to 8-byte alignment */
186 movsl_mask.mask = 7;
187 break;
188 }
189#endif
190
191 if (c->x86 == 15) {
192 set_bit(X86_FEATURE_P4, c->x86_capability);
193 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
194 }
195 if (c->x86 == 6)
196 set_bit(X86_FEATURE_P3, c->x86_capability);
197 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
198 (c->x86 == 0x6 && c->x86_model >= 0x0e))
199 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
200
201 if (cpu_has_ds) {
202 unsigned int l1;
203 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
204 if (!(l1 & (1<<11)))
205 set_bit(X86_FEATURE_BTS, c->x86_capability);
206 if (!(l1 & (1<<12)))
207 set_bit(X86_FEATURE_PEBS, c->x86_capability);
208 }
209}
210
211static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
212{
213 /* Intel PIII Tualatin. This comes in two flavours.
214 * One has 256 KB of cache, the other 512 KB. We have no way
215 * to determine which, so we use a boot-time override
216 * for the 512 KB model, and assume 256 KB otherwise.
217 */
218 if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
219 size = 256;
220 return size;
221}
222
223static struct cpu_dev intel_cpu_dev __cpuinitdata = {
224 .c_vendor = "Intel",
225 .c_ident = { "GenuineIntel" },
226 .c_models = {
227 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
228 {
229 [0] = "486 DX-25/33",
230 [1] = "486 DX-50",
231 [2] = "486 SX",
232 [3] = "486 DX/2",
233 [4] = "486 SL",
234 [5] = "486 SX/2",
235 [7] = "486 DX/2-WB",
236 [8] = "486 DX/4",
237 [9] = "486 DX/4-WB"
238 }
239 },
240 { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
241 {
242 [0] = "Pentium 60/66 A-step",
243 [1] = "Pentium 60/66",
244 [2] = "Pentium 75 - 200",
245 [3] = "OverDrive PODP5V83",
246 [4] = "Pentium MMX",
247 [7] = "Mobile Pentium 75 - 200",
248 [8] = "Mobile Pentium MMX"
249 }
250 },
251 { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
252 {
253 [0] = "Pentium Pro A-step",
254 [1] = "Pentium Pro",
255 [3] = "Pentium II (Klamath)",
256 [4] = "Pentium II (Deschutes)",
257 [5] = "Pentium II (Deschutes)",
258 [6] = "Mobile Pentium II",
259 [7] = "Pentium III (Katmai)",
260 [8] = "Pentium III (Coppermine)",
261 [10] = "Pentium III (Cascades)",
262 [11] = "Pentium III (Tualatin)",
263 }
264 },
265 { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
266 {
267 [0] = "Pentium 4 (Unknown)",
268 [1] = "Pentium 4 (Willamette)",
269 [2] = "Pentium 4 (Northwood)",
270 [4] = "Pentium 4 (Foster)",
271 [5] = "Pentium 4 (Foster)",
272 }
273 },
274 },
275 .c_init = init_intel,
276 .c_size_cache = intel_size_cache,
277};
278
279__init int intel_cpu_init(void)
280{
281 cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev;
282 return 0;
283}
284
285#ifndef CONFIG_X86_CMPXCHG
286unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
287{
288 u8 prev;
289 unsigned long flags;
290
291 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
292 local_irq_save(flags);
293 prev = *(u8 *)ptr;
294 if (prev == old)
295 *(u8 *)ptr = new;
296 local_irq_restore(flags);
297 return prev;
298}
299EXPORT_SYMBOL(cmpxchg_386_u8);
300
301unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
302{
303 u16 prev;
304 unsigned long flags;
305
306 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
307 local_irq_save(flags);
308 prev = *(u16 *)ptr;
309 if (prev == old)
310 *(u16 *)ptr = new;
311 local_irq_restore(flags);
312 return prev;
313}
314EXPORT_SYMBOL(cmpxchg_386_u16);
315
316unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
317{
318 u32 prev;
319 unsigned long flags;
320
321 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
322 local_irq_save(flags);
323 prev = *(u32 *)ptr;
324 if (prev == old)
325 *(u32 *)ptr = new;
326 local_irq_restore(flags);
327 return prev;
328}
329EXPORT_SYMBOL(cmpxchg_386_u32);
330#endif
331
332// arch_initcall(intel_cpu_init);
333
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
new file mode 100644
index 000000000000..db6c25aa5776
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -0,0 +1,806 @@
1/*
2 *	Routines to identify caches on Intel CPUs.
3 *
4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */
9
10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/device.h>
13#include <linux/compiler.h>
14#include <linux/cpu.h>
15#include <linux/sched.h>
16
17#include <asm/processor.h>
18#include <asm/smp.h>
19
20#define LVL_1_INST 1
21#define LVL_1_DATA 2
22#define LVL_2 3
23#define LVL_3 4
24#define LVL_TRACE 5
25
26struct _cache_table
27{
28 unsigned char descriptor;
29 char cache_type;
30 short size;
31};
32
33/* all the cache descriptor types we care about (no TLB entries) */
34static struct _cache_table cache_table[] __cpuinitdata =
35{
36 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
37 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
38 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
39 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
40 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
41 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
42 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
43 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */
44 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
45 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
46 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
48 { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
49 { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
51 { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
52 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
53 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
54 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
55 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */
56 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */
57 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */
58 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */
59 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
60 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */
61 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
62 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
63 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
64 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
65 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
66 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
67 { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
68 { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
69 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
70 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
71 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
72 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */
73 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
74 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
75 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
76 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
77 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */
78 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
79 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
80 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
81 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */
82 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
83 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
84 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
85 { 0x00, 0, 0}
86};
87
88
89enum _cache_type
90{
91 CACHE_TYPE_NULL = 0,
92 CACHE_TYPE_DATA = 1,
93 CACHE_TYPE_INST = 2,
94 CACHE_TYPE_UNIFIED = 3
95};
96
97union _cpuid4_leaf_eax {
98 struct {
99 enum _cache_type type:5;
100 unsigned int level:3;
101 unsigned int is_self_initializing:1;
102 unsigned int is_fully_associative:1;
103 unsigned int reserved:4;
104 unsigned int num_threads_sharing:12;
105 unsigned int num_cores_on_die:6;
106 } split;
107 u32 full;
108};
109
110union _cpuid4_leaf_ebx {
111 struct {
112 unsigned int coherency_line_size:12;
113 unsigned int physical_line_partition:10;
114 unsigned int ways_of_associativity:10;
115 } split;
116 u32 full;
117};
118
119union _cpuid4_leaf_ecx {
120 struct {
121 unsigned int number_of_sets:32;
122 } split;
123 u32 full;
124};
125
126struct _cpuid4_info {
127 union _cpuid4_leaf_eax eax;
128 union _cpuid4_leaf_ebx ebx;
129 union _cpuid4_leaf_ecx ecx;
130 unsigned long size;
131 cpumask_t shared_cpu_map;
132};
133
134unsigned short num_cache_leaves;
135
136/* AMD doesn't have CPUID4. Emulate it here to report the same
137 information to the user. This makes some assumptions about the machine:
138   L2 not shared, no SMT etc., which is currently true on AMD CPUs.
139
140 In theory the TLBs could be reported as fake type (they are in "dummy").
141 Maybe later */
142union l1_cache {
143 struct {
144 unsigned line_size : 8;
145 unsigned lines_per_tag : 8;
146 unsigned assoc : 8;
147 unsigned size_in_kb : 8;
148 };
149 unsigned val;
150};
151
152union l2_cache {
153 struct {
154 unsigned line_size : 8;
155 unsigned lines_per_tag : 4;
156 unsigned assoc : 4;
157 unsigned size_in_kb : 16;
158 };
159 unsigned val;
160};
161
162union l3_cache {
163 struct {
164 unsigned line_size : 8;
165 unsigned lines_per_tag : 4;
166 unsigned assoc : 4;
167 unsigned res : 2;
168 unsigned size_encoded : 14;
169 };
170 unsigned val;
171};
172
173static const unsigned short assocs[] = {
174 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
175 [8] = 16, [0xa] = 32, [0xb] = 48,
176 [0xc] = 64,
177 [0xf] = 0xffff // ??
178};
179
180static const unsigned char levels[] = { 1, 1, 2, 3 };
181static const unsigned char types[] = { 1, 2, 3, 3 };
182
183static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
184 union _cpuid4_leaf_ebx *ebx,
185 union _cpuid4_leaf_ecx *ecx)
186{
187 unsigned dummy;
188 unsigned line_size, lines_per_tag, assoc, size_in_kb;
189 union l1_cache l1i, l1d;
190 union l2_cache l2;
191 union l3_cache l3;
192 union l1_cache *l1 = &l1d;
193
194 eax->full = 0;
195 ebx->full = 0;
196 ecx->full = 0;
197
198 cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
199 cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
200
201 switch (leaf) {
202 case 1:
203 l1 = &l1i;
204 case 0:
205 if (!l1->val)
206 return;
207 assoc = l1->assoc;
208 line_size = l1->line_size;
209 lines_per_tag = l1->lines_per_tag;
210 size_in_kb = l1->size_in_kb;
211 break;
212 case 2:
213 if (!l2.val)
214 return;
215 assoc = l2.assoc;
216 line_size = l2.line_size;
217 lines_per_tag = l2.lines_per_tag;
218 /* cpu_data has errata corrections for K7 applied */
219 size_in_kb = current_cpu_data.x86_cache_size;
220 break;
221 case 3:
222 if (!l3.val)
223 return;
224 assoc = l3.assoc;
225 line_size = l3.line_size;
226 lines_per_tag = l3.lines_per_tag;
227 size_in_kb = l3.size_encoded * 512;
228 break;
229 default:
230 return;
231 }
232
233 eax->split.is_self_initializing = 1;
234 eax->split.type = types[leaf];
235 eax->split.level = levels[leaf];
236 if (leaf == 3)
237 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
238 else
239 eax->split.num_threads_sharing = 0;
240 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
241
242
243 if (assoc == 0xf)
244 eax->split.is_fully_associative = 1;
245 ebx->split.coherency_line_size = line_size - 1;
246 ebx->split.ways_of_associativity = assocs[assoc] - 1;
247 ebx->split.physical_line_partition = lines_per_tag - 1;
248 ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
249 (ebx->split.ways_of_associativity + 1) - 1;
250}
251
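/*
 * Worked example for the emulation above (hypothetical 0x80000006
 * values): a 512 KB, 16-way L2 with 64-byte lines reports assoc = 8
 * (the AMD encoding for 16-way), so ways_of_associativity becomes
 * assocs[8] - 1 = 15, coherency_line_size becomes 63, and
 * number_of_sets = (512 * 1024) / 64 / 16 - 1 = 511.
 */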
252static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
253{
254 union _cpuid4_leaf_eax eax;
255 union _cpuid4_leaf_ebx ebx;
256 union _cpuid4_leaf_ecx ecx;
257 unsigned edx;
258
259 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
260 amd_cpuid4(index, &eax, &ebx, &ecx);
261 else
262 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
263 if (eax.split.type == CACHE_TYPE_NULL)
264 return -EIO; /* better error ? */
265
266 this_leaf->eax = eax;
267 this_leaf->ebx = ebx;
268 this_leaf->ecx = ecx;
269 this_leaf->size = (ecx.split.number_of_sets + 1) *
270 (ebx.split.coherency_line_size + 1) *
271 (ebx.split.physical_line_partition + 1) *
272 (ebx.split.ways_of_associativity + 1);
273 return 0;
274}
275
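/*
 * Worked example of the size computation above (illustrative leaf-4
 * output for a 32 KB, 8-way L1 data cache with 64-byte lines):
 * number_of_sets = 63, coherency_line_size = 63,
 * physical_line_partition = 0, ways_of_associativity = 7, hence
 * size = 64 * 64 * 1 * 8 = 32768 bytes.
 */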
276static int __cpuinit find_num_cache_leaves(void)
277{
278 unsigned int eax, ebx, ecx, edx;
279 union _cpuid4_leaf_eax cache_eax;
280 int i = -1;
281
282 do {
283 ++i;
284 /* Do cpuid(4) loop to find out num_cache_leaves */
285 cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
286 cache_eax.full = eax;
287 } while (cache_eax.split.type != CACHE_TYPE_NULL);
288 return i;
289}
290
291unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
292{
293 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
294 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
295 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
296 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
297#ifdef CONFIG_X86_HT
298 unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
299#endif
300
301 if (c->cpuid_level > 3) {
302 static int is_initialized;
303
304 if (is_initialized == 0) {
305 /* Init num_cache_leaves from boot CPU */
306 num_cache_leaves = find_num_cache_leaves();
307 is_initialized++;
308 }
309
310 /*
311 		 * Whenever possible use cpuid(4), the deterministic cache
312 		 * parameters leaf, to find the cache details
313 */
314 for (i = 0; i < num_cache_leaves; i++) {
315 struct _cpuid4_info this_leaf;
316
317 int retval;
318
319 retval = cpuid4_cache_lookup(i, &this_leaf);
320 if (retval >= 0) {
321 switch(this_leaf.eax.split.level) {
322 case 1:
323 if (this_leaf.eax.split.type ==
324 CACHE_TYPE_DATA)
325 new_l1d = this_leaf.size/1024;
326 else if (this_leaf.eax.split.type ==
327 CACHE_TYPE_INST)
328 new_l1i = this_leaf.size/1024;
329 break;
330 case 2:
331 new_l2 = this_leaf.size/1024;
332 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
333 index_msb = get_count_order(num_threads_sharing);
334 l2_id = c->apicid >> index_msb;
335 break;
336 case 3:
337 new_l3 = this_leaf.size/1024;
338 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
339 index_msb = get_count_order(num_threads_sharing);
340 l3_id = c->apicid >> index_msb;
341 break;
342 default:
343 break;
344 }
345 }
346 }
347 }
348 /*
349 * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
350 * trace cache
351 */
352 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
353 /* supports eax=2 call */
354 int i, j, n;
355 int regs[4];
356 unsigned char *dp = (unsigned char *)regs;
357 int only_trace = 0;
358
359 if (num_cache_leaves != 0 && c->x86 == 15)
360 only_trace = 1;
361
362 /* Number of times to iterate */
363 n = cpuid_eax(2) & 0xFF;
364
365 for ( i = 0 ; i < n ; i++ ) {
366 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
367
368 /* If bit 31 is set, this is an unknown format */
369 for ( j = 0 ; j < 3 ; j++ ) {
370 if ( regs[j] < 0 ) regs[j] = 0;
371 }
372
373 /* Byte 0 is level count, not a descriptor */
374 for ( j = 1 ; j < 16 ; j++ ) {
375 unsigned char des = dp[j];
376 unsigned char k = 0;
377
378 /* look up this descriptor in the table */
379 while (cache_table[k].descriptor != 0)
380 {
381 if (cache_table[k].descriptor == des) {
382 if (only_trace && cache_table[k].cache_type != LVL_TRACE)
383 break;
384 switch (cache_table[k].cache_type) {
385 case LVL_1_INST:
386 l1i += cache_table[k].size;
387 break;
388 case LVL_1_DATA:
389 l1d += cache_table[k].size;
390 break;
391 case LVL_2:
392 l2 += cache_table[k].size;
393 break;
394 case LVL_3:
395 l3 += cache_table[k].size;
396 break;
397 case LVL_TRACE:
398 trace += cache_table[k].size;
399 break;
400 }
401
402 break;
403 }
404
405 k++;
406 }
407 }
408 }
409 }
410
411 if (new_l1d)
412 l1d = new_l1d;
413
414 if (new_l1i)
415 l1i = new_l1i;
416
417 if (new_l2) {
418 l2 = new_l2;
419#ifdef CONFIG_X86_HT
420 cpu_llc_id[cpu] = l2_id;
421#endif
422 }
423
424 if (new_l3) {
425 l3 = new_l3;
426#ifdef CONFIG_X86_HT
427 cpu_llc_id[cpu] = l3_id;
428#endif
429 }
430
431 if (trace)
432 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
433 else if ( l1i )
434 printk (KERN_INFO "CPU: L1 I cache: %dK", l1i);
435
436 if (l1d)
437 printk(", L1 D cache: %dK\n", l1d);
438 else
439 printk("\n");
440
441 if (l2)
442 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
443
444 if (l3)
445 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
446
447 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
448
449 return l2;
450}
451
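/*
 * Example of the descriptor walk above (illustrative cpuid(2) output):
 * EAX = 0x665b5001 means byte 0 (0x01) is only the iteration count;
 * the remaining descriptor bytes 0x50, 0x5b and 0x66 are looked up in
 * cache_table[], where only 0x66 matches (8 KB L1 data) -- the other
 * two are TLB descriptors, which the table deliberately omits.
 */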
452/* pointer to _cpuid4_info array (for each cache leaf) */
453static struct _cpuid4_info *cpuid4_info[NR_CPUS];
454#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y]))
455
456#ifdef CONFIG_SMP
457static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
458{
459 struct _cpuid4_info *this_leaf, *sibling_leaf;
460 unsigned long num_threads_sharing;
461 int index_msb, i;
462 struct cpuinfo_x86 *c = cpu_data;
463
464 this_leaf = CPUID4_INFO_IDX(cpu, index);
465 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
466
467 if (num_threads_sharing == 1)
468 cpu_set(cpu, this_leaf->shared_cpu_map);
469 else {
470 index_msb = get_count_order(num_threads_sharing);
471
472 for_each_online_cpu(i) {
473 if (c[i].apicid >> index_msb ==
474 c[cpu].apicid >> index_msb) {
475 cpu_set(i, this_leaf->shared_cpu_map);
476 if (i != cpu && cpuid4_info[i]) {
477 sibling_leaf = CPUID4_INFO_IDX(i, index);
478 cpu_set(cpu, sibling_leaf->shared_cpu_map);
479 }
480 }
481 }
482 }
483}
484static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
485{
486 struct _cpuid4_info *this_leaf, *sibling_leaf;
487 int sibling;
488
489 this_leaf = CPUID4_INFO_IDX(cpu, index);
490 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) {
491 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
492 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
493 }
494}
495#else
496static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {}
497static void __init cache_remove_shared_cpu_map(unsigned int cpu, int index) {}
498#endif
499
500static void free_cache_attributes(unsigned int cpu)
501{
502 kfree(cpuid4_info[cpu]);
503 cpuid4_info[cpu] = NULL;
504}
505
506static int __cpuinit detect_cache_attributes(unsigned int cpu)
507{
508 struct _cpuid4_info *this_leaf;
509 unsigned long j;
510 int retval;
511 cpumask_t oldmask;
512
513 if (num_cache_leaves == 0)
514 return -ENOENT;
515
516 cpuid4_info[cpu] = kzalloc(
517 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
518 if (cpuid4_info[cpu] == NULL)
519 return -ENOMEM;
520
521 oldmask = current->cpus_allowed;
522 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
523 if (retval)
524 goto out;
525
526 /* Do cpuid and store the results */
527 retval = 0;
528 for (j = 0; j < num_cache_leaves; j++) {
529 this_leaf = CPUID4_INFO_IDX(cpu, j);
530 retval = cpuid4_cache_lookup(j, this_leaf);
531 if (unlikely(retval < 0))
532 break;
533 cache_shared_cpu_map_setup(cpu, j);
534 }
535 set_cpus_allowed(current, oldmask);
536
537out:
538 if (retval)
539 free_cache_attributes(cpu);
540 return retval;
541}
542
543#ifdef CONFIG_SYSFS
544
545#include <linux/kobject.h>
546#include <linux/sysfs.h>
547
548extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
549
550/* pointer to kobject for cpuX/cache */
551static struct kobject * cache_kobject[NR_CPUS];
552
553struct _index_kobject {
554 struct kobject kobj;
555 unsigned int cpu;
556 unsigned short index;
557};
558
559/* pointer to array of kobjects for cpuX/cache/indexY */
560static struct _index_kobject *index_kobject[NR_CPUS];
561#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y]))
562
563#define show_one_plus(file_name, object, val) \
564static ssize_t show_##file_name \
565 (struct _cpuid4_info *this_leaf, char *buf) \
566{ \
567 return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \
568}
569
570show_one_plus(level, eax.split.level, 0);
571show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
572show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
573show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
574show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
575
576static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
577{
578 return sprintf (buf, "%luK\n", this_leaf->size / 1024);
579}
580
581static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf)
582{
583 char mask_str[NR_CPUS];
584 cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map);
585 return sprintf(buf, "%s\n", mask_str);
586}
587
588static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
589 switch(this_leaf->eax.split.type) {
590 case CACHE_TYPE_DATA:
591 return sprintf(buf, "Data\n");
592 break;
593 case CACHE_TYPE_INST:
594 return sprintf(buf, "Instruction\n");
595 break;
596 case CACHE_TYPE_UNIFIED:
597 return sprintf(buf, "Unified\n");
598 break;
599 default:
600 return sprintf(buf, "Unknown\n");
601 break;
602 }
603}
604
605struct _cache_attr {
606 struct attribute attr;
607 ssize_t (*show)(struct _cpuid4_info *, char *);
608 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
609};
610
611#define define_one_ro(_name) \
612static struct _cache_attr _name = \
613 __ATTR(_name, 0444, show_##_name, NULL)
614
615define_one_ro(level);
616define_one_ro(type);
617define_one_ro(coherency_line_size);
618define_one_ro(physical_line_partition);
619define_one_ro(ways_of_associativity);
620define_one_ro(number_of_sets);
621define_one_ro(size);
622define_one_ro(shared_cpu_map);
623
624static struct attribute * default_attrs[] = {
625 &type.attr,
626 &level.attr,
627 &coherency_line_size.attr,
628 &physical_line_partition.attr,
629 &ways_of_associativity.attr,
630 &number_of_sets.attr,
631 &size.attr,
632 &shared_cpu_map.attr,
633 NULL
634};
635
636#define to_object(k) container_of(k, struct _index_kobject, kobj)
637#define to_attr(a) container_of(a, struct _cache_attr, attr)
638
639static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
640{
641 struct _cache_attr *fattr = to_attr(attr);
642 struct _index_kobject *this_leaf = to_object(kobj);
643 ssize_t ret;
644
645 ret = fattr->show ?
646 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
647 buf) :
648 0;
649 return ret;
650}
651
652static ssize_t store(struct kobject * kobj, struct attribute * attr,
653 const char * buf, size_t count)
654{
655 return 0;
656}
657
658static struct sysfs_ops sysfs_ops = {
659 .show = show,
660 .store = store,
661};
662
663static struct kobj_type ktype_cache = {
664 .sysfs_ops = &sysfs_ops,
665 .default_attrs = default_attrs,
666};
667
668static struct kobj_type ktype_percpu_entry = {
669 .sysfs_ops = &sysfs_ops,
670};
671
672static void cpuid4_cache_sysfs_exit(unsigned int cpu)
673{
674 kfree(cache_kobject[cpu]);
675 kfree(index_kobject[cpu]);
676 cache_kobject[cpu] = NULL;
677 index_kobject[cpu] = NULL;
678 free_cache_attributes(cpu);
679}
680
681static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
682{
683
684 if (num_cache_leaves == 0)
685 return -ENOENT;
686
687 detect_cache_attributes(cpu);
688 if (cpuid4_info[cpu] == NULL)
689 return -ENOENT;
690
691 /* Allocate all required memory */
692 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
693 if (unlikely(cache_kobject[cpu] == NULL))
694 goto err_out;
695
696 index_kobject[cpu] = kzalloc(
697 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
698 if (unlikely(index_kobject[cpu] == NULL))
699 goto err_out;
700
701 return 0;
702
703err_out:
704 cpuid4_cache_sysfs_exit(cpu);
705 return -ENOMEM;
706}
707
708/* Add/Remove cache interface for CPU device */
709static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
710{
711 unsigned int cpu = sys_dev->id;
712 unsigned long i, j;
713 struct _index_kobject *this_object;
714 int retval = 0;
715
716 retval = cpuid4_cache_sysfs_init(cpu);
717 if (unlikely(retval < 0))
718 return retval;
719
720 cache_kobject[cpu]->parent = &sys_dev->kobj;
721 kobject_set_name(cache_kobject[cpu], "%s", "cache");
722 cache_kobject[cpu]->ktype = &ktype_percpu_entry;
723 retval = kobject_register(cache_kobject[cpu]);
724
725 for (i = 0; i < num_cache_leaves; i++) {
726 this_object = INDEX_KOBJECT_PTR(cpu,i);
727 this_object->cpu = cpu;
728 this_object->index = i;
729 this_object->kobj.parent = cache_kobject[cpu];
730 kobject_set_name(&(this_object->kobj), "index%1lu", i);
731 this_object->kobj.ktype = &ktype_cache;
732 retval = kobject_register(&(this_object->kobj));
733 if (unlikely(retval)) {
734 for (j = 0; j < i; j++) {
735 kobject_unregister(
736 &(INDEX_KOBJECT_PTR(cpu,j)->kobj));
737 }
738 kobject_unregister(cache_kobject[cpu]);
739 cpuid4_cache_sysfs_exit(cpu);
740 break;
741 }
742 }
743 return retval;
744}
745
746static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
747{
748 unsigned int cpu = sys_dev->id;
749 unsigned long i;
750
751 if (cpuid4_info[cpu] == NULL)
752 return;
753 for (i = 0; i < num_cache_leaves; i++) {
754 cache_remove_shared_cpu_map(cpu, i);
755 kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
756 }
757 kobject_unregister(cache_kobject[cpu]);
758 cpuid4_cache_sysfs_exit(cpu);
759 return;
760}
761
762static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
763 unsigned long action, void *hcpu)
764{
765 unsigned int cpu = (unsigned long)hcpu;
766 struct sys_device *sys_dev;
767
768 sys_dev = get_cpu_sysdev(cpu);
769 switch (action) {
770 case CPU_ONLINE:
771 case CPU_ONLINE_FROZEN:
772 cache_add_dev(sys_dev);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 cache_remove_dev(sys_dev);
777 break;
778 }
779 return NOTIFY_OK;
780}
781
782static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
783{
784 .notifier_call = cacheinfo_cpu_callback,
785};
786
787static int __cpuinit cache_sysfs_init(void)
788{
789 int i;
790
791 if (num_cache_leaves == 0)
792 return 0;
793
794 register_hotcpu_notifier(&cacheinfo_cpu_notifier);
795
796 for_each_online_cpu(i) {
797 cacheinfo_cpu_callback(&cacheinfo_cpu_notifier, CPU_ONLINE,
798 (void *)(long)i);
799 }
800
801 return 0;
802}
803
804device_initcall(cache_sysfs_init);
805
806#endif
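
The show_one_plus() attributes above export the raw CPUID leaf-4 fields filled in by cpuid4_cache_lookup(), each plus one because the hardware stores them minus one; under the standard leaf-4 encoding the cache size is the product of those decoded fields, which is what show_size() reports in kilobytes. A standalone sketch of that arithmetic (not from this patch), assuming the standard leaf-4 field layout and made-up values for a 32K, 8-way cache with 64-byte lines:

#include <stdio.h>

/* Standalone sketch: how the sysfs values exported above relate to the
 * raw CPUID leaf-4 fields.  The +1 in show_one_plus() undoes the
 * "stored minus one" encoding; the size is the product of the decoded
 * fields.  The numbers below are made up. */
int main(void)
{
	unsigned int ways_minus_1 = 7;		/* EBX[31:22] */
	unsigned int partitions_minus_1 = 0;	/* EBX[21:12] */
	unsigned int line_size_minus_1 = 63;	/* EBX[11:0]  */
	unsigned int sets_minus_1 = 63;		/* ECX[31:0]  */

	unsigned long size = (ways_minus_1 + 1) *
			     (partitions_minus_1 + 1) *
			     (line_size_minus_1 + 1) *
			     (sets_minus_1 + 1);

	printf("%luK\n", size / 1024);		/* prints 32K, as show_size() would */
	return 0;
}
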
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
new file mode 100644
index 000000000000..f1ebe1c1c17a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -0,0 +1,2 @@
1obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
2obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
new file mode 100644
index 000000000000..eef63e3630c2
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -0,0 +1,102 @@
1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine Check Handler For AMD Athlon/Duron */
19static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
20{
21 int recover=1;
22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth;
24 int i;
25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0;
29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl);
32
33 for (i=1; i<nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
35 if (high&(1<<31)) {
36 if (high & (1<<29))
37 recover |= 1;
38 if (high & (1<<25))
39 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31);
42 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow);
45 }
46 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow);
49 }
50 printk ("\n");
51 /* Clear it */
52 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
53 /* Serialize */
54 wmb();
55 add_taint(TAINT_MACHINE_CHECK);
56 }
57 }
58
59 if (recover&2)
60 panic ("CPU context corrupt");
61 if (recover&1)
62 panic ("Unable to continue");
63 printk (KERN_EMERG "Attempting to continue.\n");
64 mcgstl &= ~(1<<2);
65 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
66}
67
68
69/* AMD K7 machine check is Intel like */
70void amd_mcheck_init(struct cpuinfo_x86 *c)
71{
72 u32 l, h;
73 int i;
74
75 if (!cpu_has(c, X86_FEATURE_MCE))
76 return;
77
78 machine_check_vector = k7_machine_check;
79 wmb();
80
81 printk (KERN_INFO "Intel machine check architecture supported.\n");
82 rdmsr (MSR_IA32_MCG_CAP, l, h);
83 if (l & (1<<8)) /* Control register present ? */
84 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
85 nr_mce_banks = l & 0xff;
86
87 /* Clear status for MC index 0 separately; we don't touch CTL,
88 * as some K7 Athlons cause spurious MCEs when it's enabled. */
89 if (boot_cpu_data.x86 == 6) {
90 wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
91 i = 1;
92 } else
93 i = 0;
94 for (; i<nr_mce_banks; i++) {
95 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
96 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
97 }
98
99 set_in_cr4 (X86_CR4_MCE);
100 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
101 smp_processor_id());
102}
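
The bank loop in k7_machine_check() above tests individual bits of the high half of each 64-bit MCi_STATUS register. A standalone sketch naming those bits, assuming the architectural MCA status layout; the macros are defined locally here and are not taken from this patch:

#include <stdio.h>

/* Standalone sketch of the MCi_STATUS bit tests in k7_machine_check()
 * above.  'high' is the upper 32 bits of the 64-bit status register, so
 * bit 31 here is architectural bit 63, and so on.  The sample value is
 * made up. */
#define MCI_STATUS_VAL   (1u << 31)	/* bit 63: entry valid                */
#define MCI_STATUS_UC    (1u << 29)	/* bit 61: uncorrected error          */
#define MCI_STATUS_MISCV (1u << 27)	/* bit 59: MCi_MISC holds extra info  */
#define MCI_STATUS_ADDRV (1u << 26)	/* bit 58: MCi_ADDR holds an address  */
#define MCI_STATUS_PCC   (1u << 25)	/* bit 57: processor context corrupt  */

int main(void)
{
	unsigned int high = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_ADDRV;
	int recover = 1;

	if (high & MCI_STATUS_VAL) {
		if (high & MCI_STATUS_UC)
			recover |= 1;	/* "Unable to continue" case above */
		if (high & MCI_STATUS_PCC)
			recover |= 2;	/* "CPU context corrupt" case above */
		printf("valid error, addr %s, recover=%d\n",
		       (high & MCI_STATUS_ADDRV) ? "logged" : "absent", recover);
	}
	return 0;
}
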
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..34c781eddee4
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,90 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled = 0;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled==1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86==5)
46 intel_p5_mcheck_init(c);
47 if (c->x86==6)
48 intel_p6_mcheck_init(c);
49 if (c->x86==15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86==5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str)
78{
79 mce_disabled = 1;
80 return 1;
81}
82
83static int __init mcheck_enable(char *str)
84{
85 mce_disabled = -1;
86 return 1;
87}
88
89__setup("nomce", mcheck_disable);
90__setup("mce", mcheck_enable);
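
mce_disabled above is effectively a tri-state, and the two __setup() handlers are the only places that change it from the default. A short summary of the values (taken from this file and from the check in p5.c further down), with a purely hypothetical bootloader line:

    (default)           mce_disabled ==  0   vendor init runs; the P5 handler stays off
    boot with "nomce"   mce_disabled ==  1   mcheck_init() returns immediately
    boot with "mce"     mce_disabled == -1   the only value intel_p5_mcheck_init() accepts

    Hypothetical boot entry forcing P5 reporting on:
        kernel /vmlinuz ro root=/dev/sda1 mce
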
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
new file mode 100644
index 000000000000..81fb6e2d35f3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -0,0 +1,14 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9
10/* Call the installed machine check handler for this CPU setup. */
11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
12
13extern int nr_mce_banks;
14
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
new file mode 100644
index 000000000000..bf39409b3838
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -0,0 +1,91 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h>
16#include <linux/smp.h>
17#include <linux/module.h>
18
19#include <asm/processor.h>
20#include <asm/system.h>
21#include <asm/msr.h>
22
23#include "mce.h"
24
25static int firstbank;
26
27#define MCE_RATE 15*HZ /* timer rate is 15s */
28
29static void mce_checkregs (void *info)
30{
31 u32 low, high;
32 int i;
33
34 for (i=firstbank; i<nr_mce_banks; i++) {
35 rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
36
37 if (high & (1<<31)) {
38 printk(KERN_INFO "MCE: The hardware reports a non "
39 "fatal, correctable incident occurred on "
40 "CPU %d.\n",
41 smp_processor_id());
42 printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
45 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
46
47 /* Serialize */
48 wmb();
49 add_taint(TAINT_MACHINE_CHECK);
50 }
51 }
52}
53
54static void mce_work_fn(struct work_struct *work);
55static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
56
57static void mce_work_fn(struct work_struct *work)
58{
59 on_each_cpu(mce_checkregs, NULL, 1, 1);
60 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
61}
62
63static int __init init_nonfatal_mce_checker(void)
64{
65 struct cpuinfo_x86 *c = &boot_cpu_data;
66
67 /* Check for MCE support */
68 if (!cpu_has(c, X86_FEATURE_MCE))
69 return -ENODEV;
70
71 /* Check for PPro style MCA */
72 if (!cpu_has(c, X86_FEATURE_MCA))
73 return -ENODEV;
74
75 /* Some Athlons misbehave when we frob bank 0 */
76 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
77 boot_cpu_data.x86 == 6)
78 firstbank = 1;
79 else
80 firstbank = 0;
81
82 /*
83 * Check for non-fatal errors every MCE_RATE s
84 */
85 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
86 printk(KERN_INFO "Machine check exception polling timer started.\n");
87 return 0;
88}
89module_init(init_nonfatal_mce_checker);
90
91MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
new file mode 100644
index 000000000000..1509edfb2313
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -0,0 +1,253 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h>
15
16#include <asm/therm_throt.h>
17
18#include "mce.h"
19
20/* as supported by the P4/Xeon family */
21struct intel_mce_extended_msrs {
22 u32 eax;
23 u32 ebx;
24 u32 ecx;
25 u32 edx;
26 u32 esi;
27 u32 edi;
28 u32 ebp;
29 u32 esp;
30 u32 eflags;
31 u32 eip;
32 /* u32 *reserved[]; */
33};
34
35static int mce_num_extended_msrs = 0;
36
37
38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1);
55}
56
57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59
60fastcall void smp_thermal_interrupt(struct pt_regs *regs)
61{
62 irq_enter();
63 vendor_thermal_interrupt(regs);
64 irq_exit();
65}
66
67/* P4/Xeon Thermal regulation detect and init */
68static void intel_init_thermal(struct cpuinfo_x86 *c)
69{
70 u32 l, h;
71 unsigned int cpu = smp_processor_id();
72
73 /* Thermal monitoring */
74 if (!cpu_has(c, X86_FEATURE_ACPI))
75 return; /* -ENODEV */
76
77 /* Clock modulation */
78 if (!cpu_has(c, X86_FEATURE_ACC))
79 return; /* -ENODEV */
80
81 /* first check if it's enabled already, in which case there might
82 * be some SMM goo which handles it, so we can't even install a handler
83 * since it might already be delivered via SMI -zwanem.
84 */
85 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
86 h = apic_read(APIC_LVTTHMR);
87 if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
88 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
89 cpu);
90 return; /* -EBUSY */
91 }
92
93 /* check whether a vector already exists, temporarily masked? */
94 if (h & APIC_VECTOR_MASK) {
95 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
96 "installed\n",
97 cpu, (h & APIC_VECTOR_MASK));
98 return; /* -EBUSY */
99 }
100
101 /* The temperature transition interrupt handler setup */
102 h = THERMAL_APIC_VECTOR; /* our delivery vector */
103 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
104 apic_write_around(APIC_LVTTHMR, h);
105
106 rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
107 wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
108
109 /* ok we're good to go... */
110 vendor_thermal_interrupt = intel_thermal_interrupt;
111
112 rdmsr (MSR_IA32_MISC_ENABLE, l, h);
113 wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
114
115 l = apic_read (APIC_LVTTHMR);
116 apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
117 printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
118
119 /* enable thermal throttle processing */
120 atomic_set(&therm_throt_en, 1);
121 return;
122}
123#endif /* CONFIG_X86_MCE_P4THERMAL */
124
125
126/* P4/Xeon Extended MCE MSR retrieval */
127static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
128{
129 u32 h;
130
131 rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
132 rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
133 rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
134 rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
135 rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
136 rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
137 rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
138 rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
139 rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
140 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
141}
142
143static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
144{
145 int recover=1;
146 u32 alow, ahigh, high, low;
147 u32 mcgstl, mcgsth;
148 int i;
149
150 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
151 if (mcgstl & (1<<0)) /* Recoverable ? */
152 recover=0;
153
154 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
155 smp_processor_id(), mcgsth, mcgstl);
156
157 if (mce_num_extended_msrs > 0) {
158 struct intel_mce_extended_msrs dbg;
159 intel_get_extended_msrs(&dbg);
160 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
161 smp_processor_id(), dbg.eip, dbg.eflags);
162 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
163 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
164 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
165 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
166 }
167
168 for (i=0; i<nr_mce_banks; i++) {
169 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
170 if (high & (1<<31)) {
171 if (high & (1<<29))
172 recover |= 1;
173 if (high & (1<<25))
174 recover |= 2;
175 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
176 high &= ~(1<<31);
177 if (high & (1<<27)) {
178 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
179 printk ("[%08x%08x]", ahigh, alow);
180 }
181 if (high & (1<<26)) {
182 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
183 printk (" at %08x%08x", ahigh, alow);
184 }
185 printk ("\n");
186 }
187 }
188
189 if (recover & 2)
190 panic ("CPU context corrupt");
191 if (recover & 1)
192 panic ("Unable to continue");
193
194 printk(KERN_EMERG "Attempting to continue.\n");
195 /*
196 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
197 * recoverable/continuable. This will allow the BIOS to look at the MSRs
198 * for errors if the OS could not log the error.
199 */
200 for (i=0; i<nr_mce_banks; i++) {
201 u32 msr;
202 msr = MSR_IA32_MC0_STATUS+i*4;
203 rdmsr (msr, low, high);
204 if (high&(1<<31)) {
205 /* Clear it */
206 wrmsr(msr, 0UL, 0UL);
207 /* Serialize */
208 wmb();
209 add_taint(TAINT_MACHINE_CHECK);
210 }
211 }
212 mcgstl &= ~(1<<2);
213 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
214}
215
216
217void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
218{
219 u32 l, h;
220 int i;
221
222 machine_check_vector = intel_machine_check;
223 wmb();
224
225 printk (KERN_INFO "Intel machine check architecture supported.\n");
226 rdmsr (MSR_IA32_MCG_CAP, l, h);
227 if (l & (1<<8)) /* Control register present ? */
228 wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
229 nr_mce_banks = l & 0xff;
230
231 for (i=0; i<nr_mce_banks; i++) {
232 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
233 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
234 }
235
236 set_in_cr4 (X86_CR4_MCE);
237 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
238 smp_processor_id());
239
240 /* Check for P4/Xeon extended MCE MSRs */
241 rdmsr (MSR_IA32_MCG_CAP, l, h);
242 if (l & (1<<9)) {/* MCG_EXT_P */
243 mce_num_extended_msrs = (l >> 16) & 0xff;
244 printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
245 " available\n",
246 smp_processor_id(), mce_num_extended_msrs);
247
248#ifdef CONFIG_X86_MCE_P4THERMAL
249 /* Check for P4/Xeon Thermal monitor */
250 intel_init_thermal(c);
251#endif
252 }
253}
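
intel_p4_mcheck_init() above pulls three things out of the low word of IA32_MCG_CAP: the bank count in bits 7:0, the MCG_CTL presence bit 8, and (when MCG_EXT_P, bit 9, is set) the extended-state MSR count in bits 23:16. A standalone sketch of that decode with a made-up register value:

#include <stdio.h>

/* Standalone sketch of the MCG_CAP decoding done in intel_p4_mcheck_init()
 * above.  The value below is made up: 4 banks, MCG_CTL present, extended
 * state present, 12 extended MSRs. */
int main(void)
{
	unsigned int l = (12 << 16) | (1 << 9) | (1 << 8) | 4; /* low word of IA32_MCG_CAP */

	unsigned int nr_banks = l & 0xff;		/* bits  7:0  bank count  */
	int has_mcg_ctl       = !!(l & (1 << 8));	/* bit   8    MCG_CTL_P   */
	int has_extended      = !!(l & (1 << 9));	/* bit   9    MCG_EXT_P   */
	unsigned int ext_msrs = (l >> 16) & 0xff;	/* bits 23:16 MCG_EXT_CNT */

	printf("banks=%u ctl=%d ext=%d ext_msrs=%u\n",
	       nr_banks, has_mcg_ctl, has_extended, ext_msrs);
	/* prints: banks=4 ctl=1 ext=1 ext_msrs=12 */
	return 0;
}
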
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
new file mode 100644
index 000000000000..94bc43d950cf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -0,0 +1,53 @@
1/*
2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine check handler for Pentium class Intel */
19static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code)
20{
21 u32 loaddr, hi, lotype;
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
25 if(lotype&(1<<5))
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
27 add_taint(TAINT_MACHINE_CHECK);
28}
29
30/* Set up machine check reporting for processors with Intel style MCE */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{
33 u32 l, h;
34
35 /*Check for MCE support */
36 if( !cpu_has(c, X86_FEATURE_MCE) )
37 return;
38
39 /* Default P5 to off as it's often misconnected */
40 if(mce_disabled != -1)
41 return;
42 machine_check_vector = pentium_machine_check;
43 wmb();
44
45 /* Read registers before enabling */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n");
49
50 /* Enable MCE */
51 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
53}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
new file mode 100644
index 000000000000..deeae42ce199
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -0,0 +1,119 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10#include <linux/smp.h>
11
12#include <asm/processor.h>
13#include <asm/system.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18/* Machine Check Handler For PII/PIII */
19static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
20{
21 int recover=1;
22 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth;
24 int i;
25
26 rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0;
29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl);
32
33 for (i=0; i<nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
35 if (high & (1<<31)) {
36 if (high & (1<<29))
37 recover |= 1;
38 if (high & (1<<25))
39 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31);
42 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow);
45 }
46 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow);
49 }
50 printk ("\n");
51 }
52 }
53
54 if (recover & 2)
55 panic ("CPU context corrupt");
56 if (recover & 1)
57 panic ("Unable to continue");
58
59 printk (KERN_EMERG "Attempting to continue.\n");
60 /*
61 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
62 * recoverable/continuable. This will allow the BIOS to look at the MSRs
63 * for errors if the OS could not log the error.
64 */
65 for (i=0; i<nr_mce_banks; i++) {
66 unsigned int msr;
67 msr = MSR_IA32_MC0_STATUS+i*4;
68 rdmsr (msr,low, high);
69 if (high & (1<<31)) {
70 /* Clear it */
71 wrmsr (msr, 0UL, 0UL);
72 /* Serialize */
73 wmb();
74 add_taint(TAINT_MACHINE_CHECK);
75 }
76 }
77 mcgstl &= ~(1<<2);
78 wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
79}
80
81/* Set up machine check reporting for processors with Intel style MCE */
82void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
83{
84 u32 l, h;
85 int i;
86
87 /* Check for MCE support */
88 if (!cpu_has(c, X86_FEATURE_MCE))
89 return;
90
91 /* Check for PPro style MCA */
92 if (!cpu_has(c, X86_FEATURE_MCA))
93 return;
94
95 /* Ok machine check is available */
96 machine_check_vector = intel_machine_check;
97 wmb();
98
99 printk (KERN_INFO "Intel machine check architecture supported.\n");
100 rdmsr (MSR_IA32_MCG_CAP, l, h);
101 if (l & (1<<8)) /* Control register present ? */
102 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
103 nr_mce_banks = l & 0xff;
104
105 /*
106 * Following the example in IA-32 SDM Vol 3:
107 * - MC0_CTL should not be written
108 * - Status registers on all banks should be cleared on reset
109 */
110 for (i=1; i<nr_mce_banks; i++)
111 wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
112
113 for (i=0; i<nr_mce_banks; i++)
114 wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
115
116 set_in_cr4 (X86_CR4_MCE);
117 printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
118 smp_processor_id());
119}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000000..1203dc5ab87a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,186 @@
1/*
2 * linux/arch/x86/kernel/cpu/mcheck/therm_throt.c
3 *
4 * Thermal throttle event support code (such as syslog messaging and rate
5 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
6 * This allows consistent reporting of CPU thermal throttle events.
7 *
8 * Maintains a counter in /sys that keeps track of the number of thermal
9 * events, such that the user knows how bad the thermal problem might be
10 * (since the logging to syslog and mcelog is rate limited).
11 *
12 * Author: Dmitriy Zavin (dmitriyz@google.com)
13 *
14 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
15 * Inspired by Ross Biro's and Al Borchers' counter code.
16 */
17
18#include <linux/percpu.h>
19#include <linux/sysdev.h>
20#include <linux/cpu.h>
21#include <asm/cpu.h>
22#include <linux/notifier.h>
23#include <linux/jiffies.h>
24#include <asm/therm_throt.h>
25
26/* How long to wait between reporting thermal events */
27#define CHECK_INTERVAL (300 * HZ)
28
29static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
30static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
31atomic_t therm_throt_en = ATOMIC_INIT(0);
32
33#ifdef CONFIG_SYSFS
34#define define_therm_throt_sysdev_one_ro(_name) \
35 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
36
37#define define_therm_throt_sysdev_show_func(name) \
38static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
39 char *buf) \
40{ \
41 unsigned int cpu = dev->id; \
42 ssize_t ret; \
43 \
44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \
49 ret = 0; \
50 preempt_enable(); \
51 \
52 return ret; \
53}
54
55define_therm_throt_sysdev_show_func(count);
56define_therm_throt_sysdev_one_ro(count);
57
58static struct attribute *thermal_throttle_attrs[] = {
59 &attr_count.attr,
60 NULL
61};
62
63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle"
66};
67#endif /* CONFIG_SYSFS */
68
69/***
70 * therm_throt_process - Process thermal throttling event from interrupt
71 * @curr: Whether the condition is current or not (boolean), since the
72 * thermal interrupt normally gets called both when the thermal
73 * event begins and once the event has ended.
74 *
75 * This function is called by the thermal interrupt after the
76 * IRQ has been acknowledged.
77 *
78 * It will take care of rate limiting and printing messages to the syslog.
79 *
80 * Returns: 0 : Event should NOT be further logged, i.e. still in
81 * "timeout" from previous log message.
82 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog.
84 */
85int therm_throt_process(int curr)
86{
87 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64();
89
90 if (curr)
91 __get_cpu_var(thermal_throttle_count)++;
92
93 if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
94 return 0;
95
96 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
97
98 /* if we just entered the thermal event */
99 if (curr) {
100 printk(KERN_CRIT "CPU%d: Temperature above threshold, "
101 "cpu clock throttled (total events = %lu)\n", cpu,
102 __get_cpu_var(thermal_throttle_count));
103
104 add_taint(TAINT_MACHINE_CHECK);
105 } else {
106 printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
107 }
108
109 return 1;
110}
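
therm_throt_process() above rate-limits syslog output with a per-CPU next_check timestamp while still counting every event. A standalone simulation of that behaviour (not kernel code) over ten minutes of once-per-second throttle interrupts, with an illustrative HZ value:

#include <stdio.h>

/* Standalone sketch of the rate limiting in therm_throt_process(): a
 * per-CPU "next_check" timestamp gates syslog output to once per
 * CHECK_INTERVAL (300 * HZ jiffies, i.e. five minutes), while the
 * per-CPU event counter still increments on every throttle interrupt. */
#define HZ_EXAMPLE     250		/* illustrative jiffies-per-second */
#define CHECK_INTERVAL (300 * HZ_EXAMPLE)

int main(void)
{
	unsigned long long now = 0, next_check = 0;
	unsigned long count = 0;
	int i, logged = 0;

	/* simulate a throttle interrupt firing once a second for 10 minutes */
	for (i = 0; i < 600; i++, now += HZ_EXAMPLE) {
		count++;			/* thermal_throttle_count++ */
		if (now < next_check)
			continue;		/* still in timeout: return 0 */
		next_check = now + CHECK_INTERVAL;
		logged++;			/* return 1: message hits syslog */
	}
	printf("events=%lu, syslog lines=%d\n", count, logged);
	/* prints: events=600, syslog lines=2 */
	return 0;
}
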
111
112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
117}
118
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
120{
121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122}
123
124/* Mutex protecting device creation against CPU hotplug */
125static DEFINE_MUTEX(therm_cpu_lock);
126
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
129 unsigned long action,
130 void *hcpu)
131{
132 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev;
134 int err;
135
136 sys_dev = get_cpu_sysdev(cpu);
137 switch (action) {
138 case CPU_ONLINE:
139 case CPU_ONLINE_FROZEN:
140 mutex_lock(&therm_cpu_lock);
141 err = thermal_throttle_add_dev(sys_dev);
142 mutex_unlock(&therm_cpu_lock);
143 WARN_ON(err);
144 break;
145 case CPU_DEAD:
146 case CPU_DEAD_FROZEN:
147 mutex_lock(&therm_cpu_lock);
148 thermal_throttle_remove_dev(sys_dev);
149 mutex_unlock(&therm_cpu_lock);
150 break;
151 }
152 return NOTIFY_OK;
153}
154
155static struct notifier_block thermal_throttle_cpu_notifier =
156{
157 .notifier_call = thermal_throttle_cpu_callback,
158};
159
160static __init int thermal_throttle_init_device(void)
161{
162 unsigned int cpu = 0;
163 int err;
164
165 if (!atomic_read(&therm_throt_en))
166 return 0;
167
168 register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
169
170#ifdef CONFIG_HOTPLUG_CPU
171 mutex_lock(&therm_cpu_lock);
172#endif
173 /* connect live CPUs to sysfs */
174 for_each_online_cpu(cpu) {
175 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
176 WARN_ON(err);
177 }
178#ifdef CONFIG_HOTPLUG_CPU
179 mutex_unlock(&therm_cpu_lock);
180#endif
181
182 return 0;
183}
184
185device_initcall(thermal_throttle_init_device);
186#endif /* CONFIG_SYSFS */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
new file mode 100644
index 000000000000..9e424b6c293d
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -0,0 +1,36 @@
1/*
2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/msr.h>
14
15#include "mce.h"
16
17/* Machine check handler for WinChip C6 */
18static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code)
19{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK);
22}
23
24/* Set up machine check reporting on the Winchip C6 series */
25void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{
27 u32 lo, hi;
28 machine_check_vector = winchip_machine_check;
29 wmb();
30 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo&= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi);
34 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
36}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
new file mode 100644
index 000000000000..191fc0533649
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,3 @@
1obj-y := main.o if.o generic.o state.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
new file mode 100644
index 000000000000..0949cdbf848a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -0,0 +1,121 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5
6#include "mtrr.h"
7
8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned long *size, mtrr_type * type)
11{
12 unsigned long low, high;
13
14 rdmsr(MSR_K6_UWCCR, low, high);
15 /* Upper dword is region 1, lower is region 0 */
16 if (reg == 1)
17 low = high;
18 /* The base is masked off to the correct alignment */
19 *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
20 *type = 0;
21 if (low & 1)
22 *type = MTRR_TYPE_UNCACHABLE;
23 if (low & 2)
24 *type = MTRR_TYPE_WRCOMB;
25 if (!(low & 3)) {
26 *size = 0;
27 return;
28 }
29 /*
30 * This needs a little explaining. The size is stored as an
31 * inverted mask of bits of 128K granularity, 15 bits long, offset
32 * by 2 bits.
33 *
34 * So to get the size we invert the mask and add 1 at the lowest
35 * mask bit (4, as it's 2 bits in). This gives us a size we then
36 * shift to turn into 128K blocks.
37 *
38 * eg 111 1111 1111 1100 is 512K
39 *
40 * invert 000 0000 0000 0011
41 * +1 000 0000 0000 0100
42 * *128K ...
43 */
44 low = (~low) & 0x1FFFC;
45 *size = (low + 4) << (15 - PAGE_SHIFT);
46 return;
47}
48
49static void amd_set_mtrr(unsigned int reg, unsigned long base,
50 unsigned long size, mtrr_type type)
51/* [SUMMARY] Set variable MTRR register on the local CPU.
52 <reg> The register to set.
53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region.
56 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
57 be done externally.
58 [RETURNS] Nothing.
59*/
60{
61 u32 regs[2];
62
63 /*
64 * Low is MTRR0 , High MTRR 1
65 */
66 rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
67 /*
68 * Blank to disable
69 */
70 if (size == 0)
71 regs[reg] = 0;
72 else
73 /* Set the register to the base, the type (off by one) and an
74 inverted bitmask of the size. The size is the only odd
75 bit. We are fed, say, 512K. We invert this and get 111 1111
76 1111 1011, but if you subtract one and invert you get the
77 desired 111 1111 1111 1100 mask.
78
79 But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */
80 regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
81 | (base << PAGE_SHIFT) | (type + 1);
82
83 /*
84 * The writeback rule is quite specific. See the manual. It's:
85 * disable local interrupts, write back the cache, set the MTRR.
86 */
87 wbinvd();
88 wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
89}
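
The two comments above describe the same UWCCR size field from both directions: amd_set_mtrr() stores -size scaled into a 15-bit field two bits in, and amd_get_mtrr() inverts, adds 4 and shifts back. A standalone round trip of the 512K example (not part of this patch), with PAGE_SHIFT assumed to be 12:

#include <stdio.h>

/* Standalone round trip of the K6 UWCCR size encoding described in the
 * comments above, using the 512K example.  PAGE_SHIFT is assumed to be
 * 12 (4K pages), as on i386. */
#define PAGE_SHIFT_EX 12

int main(void)
{
	unsigned long size = (512 * 1024) >> PAGE_SHIFT_EX;	/* 512K = 128 pages */
	unsigned long low, inv, decoded;

	/* encode, as in amd_set_mtrr(): -size scaled into the mask field */
	low = -size >> (15 - PAGE_SHIFT_EX) & 0x0001FFFC;

	/* decode, as in amd_get_mtrr(): invert, add the lowest mask bit, rescale */
	inv = (~low) & 0x1FFFC;
	decoded = (inv + 4) << (15 - PAGE_SHIFT_EX);		/* back to pages */

	printf("low=0x%05lx decoded=%luK\n",
	       low, (decoded << PAGE_SHIFT_EX) >> 10);
	/* prints: low=0x1fff0 decoded=512K */
	return 0;
}
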
90
91static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
92{
93 /* Apply the K6 block alignment and size rules
94 In order
95 o Uncached or gathering only
96 o 128K or bigger block
97 o Power of 2 block
98 o base suitably aligned to the power
99 */
100 if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
101 || (size & ~(size - 1)) - size || (base & (size - 1)))
102 return -EINVAL;
103 return 0;
104}
105
106static struct mtrr_ops amd_mtrr_ops = {
107 .vendor = X86_VENDOR_AMD,
108 .set = amd_set_mtrr,
109 .get = amd_get_mtrr,
110 .get_free_region = generic_get_free_region,
111 .validate_add_page = amd_validate_add_page,
112 .have_wrcomb = positive_have_wrcomb,
113};
114
115int __init amd_init_mtrr(void)
116{
117 set_mtrr_ops(&amd_mtrr_ops);
118 return 0;
119}
120
121//arch_initcall(amd_mtrr_init);
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
new file mode 100644
index 000000000000..cb9aa3a7a7ab
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -0,0 +1,224 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include "mtrr.h"
6
7static struct {
8 unsigned long high;
9 unsigned long low;
10} centaur_mcr[8];
11
12static u8 centaur_mcr_reserved;
13static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
14
15/*
16 * Report boot time MCR setups
17 */
18
19static int
20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region.
24 [RETURNS] The index of the region on success, else -ENOSPC on error.
25*/
26{
27 int i, max;
28 mtrr_type ltype;
29 unsigned long lbase, lsize;
30
31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
34 for (i = 0; i < max; ++i) {
35 if (centaur_mcr_reserved & (1 << i))
36 continue;
37 mtrr_if->get(i, &lbase, &lsize, &ltype);
38 if (lsize == 0)
39 return i;
40 }
41 return -ENOSPC;
42}
43
44void
45mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
46{
47 centaur_mcr[mcr].low = lo;
48 centaur_mcr[mcr].high = hi;
49}
50
51static void
52centaur_get_mcr(unsigned int reg, unsigned long *base,
53 unsigned long *size, mtrr_type * type)
54{
55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
57 *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */
58 if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
59 *type = MTRR_TYPE_UNCACHABLE;
60 if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
61 *type = MTRR_TYPE_WRBACK;
62 if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
63 *type = MTRR_TYPE_WRBACK;
64
65}
66
67static void centaur_set_mcr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type)
69{
70 unsigned long low, high;
71
72 if (size == 0) {
73 /* Disable */
74 high = low = 0;
75 } else {
76 high = base << PAGE_SHIFT;
77 if (centaur_mcr_type == 0)
78 low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */
79 else {
80 if (type == MTRR_TYPE_UNCACHABLE)
81 low = -size << PAGE_SHIFT | 0x02; /* NC */
82 else
83 low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */
84 }
85 }
86 centaur_mcr[reg].high = high;
87 centaur_mcr[reg].low = low;
88 wrmsr(MSR_IDT_MCR0 + reg, low, high);
89}
90
91#if 0
92/*
93 * Initialise the later (saner) Winchip MCR variant. In this version
94 * the BIOS can pass us the registers it has used (but not their values)
95 * and the control register is read/write
96 */
97
98static void __init
99centaur_mcr1_init(void)
100{
101 unsigned i;
102 u32 lo, hi;
103
104 /* Unfortunately, MCR's are read-only, so there is no way to
105 * find out what the bios might have done.
106 */
107
108 rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
109 if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */
110 lo &= ~0x1C0; /* clear key */
111 lo |= 0x040; /* set key to 1 */
112 wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */
113 }
114
115 centaur_mcr_type = 1;
116
117 /*
118 * Clear any unconfigured MCR's.
119 */
120
121 for (i = 0; i < 8; ++i) {
122 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) {
123 if (!(lo & (1 << (9 + i))))
124 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
125 else
126 /*
127 * If the BIOS set up an MCR we cannot see it
128 * but we don't wish to obliterate it
129 */
130 centaur_mcr_reserved |= (1 << i);
131 }
132 }
133 /*
134 * Throw the main write-combining switch...
135 * However if OOSTORE is enabled then people have already done far
136 * cleverer things and we should behave.
137 */
138
139 lo |= 15; /* Write combine enables */
140 wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
141}
142
143/*
144 * Initialise the original WinChip with read-only MCR registers,
145 * no used-register bitmask for the BIOS to pass on, and a write-only control
146 */
147
148static void __init
149centaur_mcr0_init(void)
150{
151 unsigned i;
152
153 /* Unfortunately, MCR's are read-only, so there is no way to
154 * find out what the bios might have done.
155 */
156
157 /* Clear any unconfigured MCR's.
158 * This way we are sure that the centaur_mcr array contains the actual
159 * values. The disadvantage is that any BIOS tweaks are thus undone.
160 *
161 */
162 for (i = 0; i < 8; ++i) {
163 if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0)
164 wrmsr(MSR_IDT_MCR0 + i, 0, 0);
165 }
166
167 wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */
168}
169
170/*
171 * Initialise Winchip series MCR registers
172 */
173
174static void __init
175centaur_mcr_init(void)
176{
177 struct set_mtrr_context ctxt;
178
179 set_mtrr_prepare_save(&ctxt);
180 set_mtrr_cache_disable(&ctxt);
181
182 if (boot_cpu_data.x86_model == 4)
183 centaur_mcr0_init();
184 else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9)
185 centaur_mcr1_init();
186
187 set_mtrr_done(&ctxt);
188}
189#endif
190
191static int centaur_validate_add_page(unsigned long base,
192 unsigned long size, unsigned int type)
193{
194 /*
195 * FIXME: Winchip2 supports uncached
196 */
197 if (type != MTRR_TYPE_WRCOMB &&
198 (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
199 printk(KERN_WARNING
200 "mtrr: only write-combining%s supported\n",
201 centaur_mcr_type ? " and uncacheable are"
202 : " is");
203 return -EINVAL;
204 }
205 return 0;
206}
207
208static struct mtrr_ops centaur_mtrr_ops = {
209 .vendor = X86_VENDOR_CENTAUR,
210// .init = centaur_mcr_init,
211 .set = centaur_set_mcr,
212 .get = centaur_get_mcr,
213 .get_free_region = centaur_get_free_region,
214 .validate_add_page = centaur_validate_add_page,
215 .have_wrcomb = positive_have_wrcomb,
216};
217
218int __init centaur_init_mtrr(void)
219{
220 set_mtrr_ops(&centaur_mtrr_ops);
221 return 0;
222}
223
224//arch_initcall(centaur_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
new file mode 100644
index 000000000000..2287d4863a8a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -0,0 +1,380 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3#include <asm/mtrr.h>
4#include <asm/msr.h>
5#include <asm/io.h>
6#include <asm/processor-cyrix.h>
7#include "mtrr.h"
8
9int arr3_protected;
10
11static void
12cyrix_get_arr(unsigned int reg, unsigned long *base,
13 unsigned long *size, mtrr_type * type)
14{
15 unsigned long flags;
16 unsigned char arr, ccr3, rcr, shift;
17
18 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
19
20 /* Save flags and disable interrupts */
21 local_irq_save(flags);
22
23 ccr3 = getCx86(CX86_CCR3);
24 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
25 ((unsigned char *) base)[3] = getCx86(arr);
26 ((unsigned char *) base)[2] = getCx86(arr + 1);
27 ((unsigned char *) base)[1] = getCx86(arr + 2);
28 rcr = getCx86(CX86_RCR_BASE + reg);
29 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
30
31 /* Enable interrupts if it was enabled previously */
32 local_irq_restore(flags);
33 shift = ((unsigned char *) base)[1] & 0x0f;
34 *base >>= PAGE_SHIFT;
35
36 /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
37 * Note: shift==0xf means 4G, this is unsupported.
38 */
39 if (shift)
40 *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
41 else
42 *size = 0;
43
44 /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
45 if (reg < 7) {
46 switch (rcr) {
47 case 1:
48 *type = MTRR_TYPE_UNCACHABLE;
49 break;
50 case 8:
51 *type = MTRR_TYPE_WRBACK;
52 break;
53 case 9:
54 *type = MTRR_TYPE_WRCOMB;
55 break;
56 case 24:
57 default:
58 *type = MTRR_TYPE_WRTHROUGH;
59 break;
60 }
61 } else {
62 switch (rcr) {
63 case 0:
64 *type = MTRR_TYPE_UNCACHABLE;
65 break;
66 case 8:
67 *type = MTRR_TYPE_WRCOMB;
68 break;
69 case 9:
70 *type = MTRR_TYPE_WRBACK;
71 break;
72 case 25:
73 default:
74 *type = MTRR_TYPE_WRTHROUGH;
75 break;
76 }
77 }
78}
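
cyrix_get_arr() above recovers the region size from a 4-bit shift value: ARR0-ARR6 start at one 4K page, ARR7 at 0x40 pages (256K), and each step doubles the size. A standalone table of the first few values (not part of this patch):

#include <stdio.h>

/* Standalone decode of the ARR size nibble used in cyrix_get_arr()
 * above; sizes are shown in KB.  shift == 0 means the ARR is disabled,
 * and shift == 0xf on ARR7 (4G) is unsupported per the comment above. */
int main(void)
{
	unsigned int shift;

	for (shift = 1; shift <= 4; shift++) {
		unsigned long arr0_6 = (0x1UL << (shift - 1)) * 4;	/* 4K pages -> KB */
		unsigned long arr7   = (0x40UL << (shift - 1)) * 4;
		printf("shift=%u ARR0-6=%luK ARR7=%luK\n", shift, arr0_6, arr7);
	}
	/*
	 * shift=1 ARR0-6=4K ARR7=256K
	 * shift=2 ARR0-6=8K ARR7=512K
	 * shift=3 ARR0-6=16K ARR7=1024K
	 * shift=4 ARR0-6=32K ARR7=2048K
	 */
	return 0;
}
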
79
80static int
81cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
82/* [SUMMARY] Get a free ARR.
83 <base> The starting (base) address of the region.
84 <size> The size (in bytes) of the region.
85 [RETURNS] The index of the region on success, else -ENOSPC on error.
86*/
87{
88 int i;
89 mtrr_type ltype;
90 unsigned long lbase, lsize;
91
92 switch (replace_reg) {
93 case 7:
94 if (size < 0x40)
95 break;
96 case 6:
97 case 5:
98 case 4:
99 return replace_reg;
100 case 3:
101 if (arr3_protected)
102 break;
103 case 2:
104 case 1:
105 case 0:
106 return replace_reg;
107 }
108 /* If we are to set up a region >32M then look at ARR7 immediately */
109 if (size > 0x2000) {
110 cyrix_get_arr(7, &lbase, &lsize, &ltype);
111 if (lsize == 0)
112 return 7;
113 /* Else try ARR0-ARR6 first */
114 } else {
115 for (i = 0; i < 7; i++) {
116 cyrix_get_arr(i, &lbase, &lsize, &ltype);
117 if ((i == 3) && arr3_protected)
118 continue;
119 if (lsize == 0)
120 return i;
121 }
122 /* ARR0-ARR6 aren't free, try ARR7 but its size must be at least 256K */
123 cyrix_get_arr(i, &lbase, &lsize, &ltype);
124 if ((lsize == 0) && (size >= 0x40))
125 return i;
126 }
127 return -ENOSPC;
128}
129
130static u32 cr4 = 0;
131static u32 ccr3;
132
133static void prepare_set(void)
134{
135 u32 cr0;
136
137 /* Save value of CR4 and clear Page Global Enable (bit 7) */
138 if ( cpu_has_pge ) {
139 cr4 = read_cr4();
140 write_cr4(cr4 & ~X86_CR4_PGE);
141 }
142
143 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
144 a side-effect */
145 cr0 = read_cr0() | 0x40000000;
146 wbinvd();
147 write_cr0(cr0);
148 wbinvd();
149
150 /* Cyrix ARRs - everything else was excluded at the top */
151 ccr3 = getCx86(CX86_CCR3);
152
153 /* Cyrix ARRs - everything else was excluded at the top */
154 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
155
156}
157
158static void post_set(void)
159{
160 /* Flush caches and TLBs */
161 wbinvd();
162
163 /* Cyrix ARRs - everything else was excluded at the top */
164 setCx86(CX86_CCR3, ccr3);
165
166 /* Enable caches */
167 write_cr0(read_cr0() & 0xbfffffff);
168
169 /* Restore value of CR4 */
170 if ( cpu_has_pge )
171 write_cr4(cr4);
172}
173
174static void cyrix_set_arr(unsigned int reg, unsigned long base,
175 unsigned long size, mtrr_type type)
176{
177 unsigned char arr, arr_type, arr_size;
178
179 arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
180
181 /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
182 if (reg >= 7)
183 size >>= 6;
184
185 size &= 0x7fff; /* make sure arr_size <= 14 */
186 for (arr_size = 0; size; arr_size++, size >>= 1) ;
187
188 if (reg < 7) {
189 switch (type) {
190 case MTRR_TYPE_UNCACHABLE:
191 arr_type = 1;
192 break;
193 case MTRR_TYPE_WRCOMB:
194 arr_type = 9;
195 break;
196 case MTRR_TYPE_WRTHROUGH:
197 arr_type = 24;
198 break;
199 default:
200 arr_type = 8;
201 break;
202 }
203 } else {
204 switch (type) {
205 case MTRR_TYPE_UNCACHABLE:
206 arr_type = 0;
207 break;
208 case MTRR_TYPE_WRCOMB:
209 arr_type = 8;
210 break;
211 case MTRR_TYPE_WRTHROUGH:
212 arr_type = 25;
213 break;
214 default:
215 arr_type = 9;
216 break;
217 }
218 }
219
220 prepare_set();
221
222 base <<= PAGE_SHIFT;
223 setCx86(arr, ((unsigned char *) &base)[3]);
224 setCx86(arr + 1, ((unsigned char *) &base)[2]);
225 setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size);
226 setCx86(CX86_RCR_BASE + reg, arr_type);
227
228 post_set();
229}
230
231typedef struct {
232 unsigned long base;
233 unsigned long size;
234 mtrr_type type;
235} arr_state_t;
236
237static arr_state_t arr_state[8] = {
238 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
239 {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
240};
241
242static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
243
244static void cyrix_set_all(void)
245{
246 int i;
247
248 prepare_set();
249
250 /* the CCRs are not contiguous */
251 for (i = 0; i < 4; i++)
252 setCx86(CX86_CCR0 + i, ccr_state[i]);
253 for (; i < 7; i++)
254 setCx86(CX86_CCR4 + i, ccr_state[i]);
255 for (i = 0; i < 8; i++)
256 cyrix_set_arr(i, arr_state[i].base,
257 arr_state[i].size, arr_state[i].type);
258
259 post_set();
260}
261
262#if 0
263/*
264 * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection
265 * with the SMM (System Management Mode) mode. So we need the following:
266 * Check whether SMI_LOCK (CCR3 bit 0) is set
267 * if it is set, write a warning message: ARR3 cannot be changed!
268 * (it cannot be changed until the next processor reset)
269 * if it is reset, then we can change it, set all the needed bits:
270 * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
271 * - disable access to SMM memory (CCR1 bit 2 reset)
272 * - disable SMM mode (CCR1 bit 1 reset)
273 * - disable write protection of ARR3 (CCR6 bit 1 reset)
274 * - (maybe) disable ARR3
275 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
276 */
277static void __init
278cyrix_arr_init(void)
279{
280 struct set_mtrr_context ctxt;
281 unsigned char ccr[7];
282 int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
283#ifdef CONFIG_SMP
284 int i;
285#endif
286
287 /* flush cache and enable MAPEN */
288 set_mtrr_prepare_save(&ctxt);
289 set_mtrr_cache_disable(&ctxt);
290
291 /* Save all CCRs locally */
292 ccr[0] = getCx86(CX86_CCR0);
293 ccr[1] = getCx86(CX86_CCR1);
294 ccr[2] = getCx86(CX86_CCR2);
295 ccr[3] = ctxt.ccr3;
296 ccr[4] = getCx86(CX86_CCR4);
297 ccr[5] = getCx86(CX86_CCR5);
298 ccr[6] = getCx86(CX86_CCR6);
299
300 if (ccr[3] & 1) {
301 ccrc[3] = 1;
302 arr3_protected = 1;
303 } else {
304 /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
305 * access to SMM memory through ARR3 (bit 7).
306 */
307 if (ccr[1] & 0x80) {
308 ccr[1] &= 0x7f;
309 ccrc[1] |= 0x80;
310 }
311 if (ccr[1] & 0x04) {
312 ccr[1] &= 0xfb;
313 ccrc[1] |= 0x04;
314 }
315 if (ccr[1] & 0x02) {
316 ccr[1] &= 0xfd;
317 ccrc[1] |= 0x02;
318 }
319 arr3_protected = 0;
320 if (ccr[6] & 0x02) {
321 ccr[6] &= 0xfd;
322 ccrc[6] = 1; /* Disable write protection of ARR3 */
323 setCx86(CX86_CCR6, ccr[6]);
324 }
325 /* Disable ARR3. This is safe now that we disabled SMM. */
326 /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
327 }
328 /* If we changed CCR1 in memory, change it in the processor, too. */
329 if (ccrc[1])
330 setCx86(CX86_CCR1, ccr[1]);
331
332 /* Enable ARR usage by the processor */
333 if (!(ccr[5] & 0x20)) {
334 ccr[5] |= 0x20;
335 ccrc[5] = 1;
336 setCx86(CX86_CCR5, ccr[5]);
337 }
338#ifdef CONFIG_SMP
339 for (i = 0; i < 7; i++)
340 ccr_state[i] = ccr[i];
341 for (i = 0; i < 8; i++)
342 cyrix_get_arr(i,
343 &arr_state[i].base, &arr_state[i].size,
344 &arr_state[i].type);
345#endif
346
347 set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */
348
349 if (ccrc[5])
350 printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
351 if (ccrc[3])
352 printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
353/*
354 if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
355 if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
356 if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
357*/
358 if (ccrc[6])
359 printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
360}
361#endif
362
363static struct mtrr_ops cyrix_mtrr_ops = {
364 .vendor = X86_VENDOR_CYRIX,
365// .init = cyrix_arr_init,
366 .set_all = cyrix_set_all,
367 .set = cyrix_set_arr,
368 .get = cyrix_get_arr,
369 .get_free_region = cyrix_get_free_region,
370 .validate_add_page = generic_validate_add_page,
371 .have_wrcomb = positive_have_wrcomb,
372};
373
374int __init cyrix_init_mtrr(void)
375{
376 set_mtrr_ops(&cyrix_mtrr_ops);
377 return 0;
378}
379
380//arch_initcall(cyrix_init_mtrr);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
new file mode 100644
index 000000000000..56f64e34829f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -0,0 +1,509 @@
1/* This only handles 32-bit MTRRs on 32-bit hosts. This is strictly wrong
2 because MTRRs can span up to 40 bits (36 bits on most modern x86) */
3#include <linux/init.h>
4#include <linux/slab.h>
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <asm/io.h>
8#include <asm/mtrr.h>
9#include <asm/msr.h>
10#include <asm/system.h>
11#include <asm/cpufeature.h>
12#include <asm/tlbflush.h>
13#include "mtrr.h"
14
15struct mtrr_state {
16 struct mtrr_var_range *var_ranges;
17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
18 unsigned char enabled;
19 unsigned char have_fixed;
20 mtrr_type def_type;
21};
22
23struct fixed_range_block {
24 int base_msr; /* start address of an MTRR block */
25 int ranges; /* number of MTRRs in this block */
26};
27
28static struct fixed_range_block fixed_range_blocks[] = {
29 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
30 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
31 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
32 {}
33};
34
35static unsigned long smp_changes_mask;
36static struct mtrr_state mtrr_state = {};
37
38#undef MODULE_PARAM_PREFIX
39#define MODULE_PARAM_PREFIX "mtrr."
40
41static int mtrr_show;
42module_param_named(show, mtrr_show, bool, 0);
43
44/* Get the MSR pair relating to a var range */
45static void
46get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
47{
48 rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
49 rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
50}
51
52static void
53get_fixed_ranges(mtrr_type * frs)
54{
55 unsigned int *p = (unsigned int *) frs;
56 int i;
57
58 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
59
60 for (i = 0; i < 2; i++)
61 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
62 for (i = 0; i < 8; i++)
63 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
64}
65
66void mtrr_save_fixed_ranges(void *info)
67{
68 if (cpu_has_mtrr)
69 get_fixed_ranges(mtrr_state.fixed_ranges);
70}
71
72static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
73{
74 unsigned i;
75
76 for (i = 0; i < 8; ++i, ++types, base += step)
77 printk(KERN_INFO "MTRR %05X-%05X %s\n",
78 base, base + step - 1, mtrr_attrib_to_str(*types));
79}
80
81/* Grab all of the MTRR state for this CPU into *state */
82void __init get_mtrr_state(void)
83{
84 unsigned int i;
85 struct mtrr_var_range *vrs;
86 unsigned lo, dummy;
87
88 if (!mtrr_state.var_ranges) {
89 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
90 GFP_KERNEL);
91 if (!mtrr_state.var_ranges)
92 return;
93 }
94 vrs = mtrr_state.var_ranges;
95
96 rdmsr(MTRRcap_MSR, lo, dummy);
97 mtrr_state.have_fixed = (lo >> 8) & 1;
98
99 for (i = 0; i < num_var_ranges; i++)
100 get_mtrr_var_range(i, &vrs[i]);
101 if (mtrr_state.have_fixed)
102 get_fixed_ranges(mtrr_state.fixed_ranges);
103
104 rdmsr(MTRRdefType_MSR, lo, dummy);
105 mtrr_state.def_type = (lo & 0xff);
106 mtrr_state.enabled = (lo & 0xc00) >> 10;
107
108 if (mtrr_show) {
109 int high_width;
110
111 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
112 if (mtrr_state.have_fixed) {
113 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
114 mtrr_state.enabled & 1 ? "en" : "dis");
115 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
116 for (i = 0; i < 2; ++i)
117 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
118 for (i = 0; i < 8; ++i)
119 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
120 }
121 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
122 mtrr_state.enabled & 2 ? "en" : "dis");
123 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
124 for (i = 0; i < num_var_ranges; ++i) {
125 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
126 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
127 i,
128 high_width,
129 mtrr_state.var_ranges[i].base_hi,
130 mtrr_state.var_ranges[i].base_lo >> 12,
131 high_width,
132 mtrr_state.var_ranges[i].mask_hi,
133 mtrr_state.var_ranges[i].mask_lo >> 12,
134 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
135 else
136 printk(KERN_INFO "MTRR %u disabled\n", i);
137 }
138 }
139}
140
141/* Some BIOSes are broken and don't set all MTRRs the same! */
142void __init mtrr_state_warn(void)
143{
144 unsigned long mask = smp_changes_mask;
145
146 if (!mask)
147 return;
148 if (mask & MTRR_CHANGE_MASK_FIXED)
149 printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n");
150 if (mask & MTRR_CHANGE_MASK_VARIABLE)
151 printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n");
152 if (mask & MTRR_CHANGE_MASK_DEFTYPE)
153 printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n");
154 printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
155 printk(KERN_INFO "mtrr: corrected configuration.\n");
156}
157
158/* Doesn't attempt to pass an error out to MTRR users,
159   because it's quite complicated in some cases and probably not
160   worth it; the best error handling is to ignore it. */
161void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
162{
163 if (wrmsr_safe(msr, a, b) < 0)
164 printk(KERN_ERR
165 "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
166 smp_processor_id(), msr, a, b);
167}
168
169/**
170 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
171 * see AMD publication no. 24593, chapter 3.2.1 for more information
172 */
173static inline void k8_enable_fixed_iorrs(void)
174{
175 unsigned lo, hi;
176
177 rdmsr(MSR_K8_SYSCFG, lo, hi);
178 mtrr_wrmsr(MSR_K8_SYSCFG, lo
179 | K8_MTRRFIXRANGE_DRAM_ENABLE
180 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
181}
182
183/**
184 * Checks and updates a fixed-range MTRR if it differs from the value it
185 * should have. If K8 extensions are wanted, update the K8 SYSCFG MSR also.
186 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
187 * \param msr MSR address of the MTRR which should be checked and updated
188 * \param changed pointer which indicates whether the MTRR needed to be changed
189 * \param msrwords pointer to the MSR values which the MSR should have
190 */
191static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
192{
193 unsigned lo, hi;
194
195 rdmsr(msr, lo, hi);
196
197 if (lo != msrwords[0] || hi != msrwords[1]) {
198 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
199 boot_cpu_data.x86 == 15 &&
200 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
201 k8_enable_fixed_iorrs();
202 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
203 *changed = TRUE;
204 }
205}
206
207int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
208/* [SUMMARY] Get a free MTRR.
209 <base> The starting (base) address of the region.
210 <size> The size (in bytes) of the region.
211 [RETURNS] The index of the region on success, else -ENOSPC on error.
212*/
213{
214 int i, max;
215 mtrr_type ltype;
216 unsigned long lbase, lsize;
217
218 max = num_var_ranges;
219 if (replace_reg >= 0 && replace_reg < max)
220 return replace_reg;
221 for (i = 0; i < max; ++i) {
222 mtrr_if->get(i, &lbase, &lsize, &ltype);
223 if (lsize == 0)
224 return i;
225 }
226 return -ENOSPC;
227}
228
229static void generic_get_mtrr(unsigned int reg, unsigned long *base,
230 unsigned long *size, mtrr_type *type)
231{
232 unsigned int mask_lo, mask_hi, base_lo, base_hi;
233
234 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
235 if ((mask_lo & 0x800) == 0) {
236 /* Invalid (i.e. free) range */
237 *base = 0;
238 *size = 0;
239 *type = 0;
240 return;
241 }
242
243 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
244
245 /* Work out the shifted address mask. */
246 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT)
247 | mask_lo >> PAGE_SHIFT;
248
249 /* This works correctly if size is a power of two, i.e. a
250 contiguous range. */
251 *size = -mask_lo;
252 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
253 *type = base_lo & 0xff;
254}
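
A worked instance of the decode above may help (a sketch, assuming PAGE_SHIFT = 12 and a 36-bit physical address width, so the 32-bit view of size_or_mask is 0xff000000): a 64 MB write-combining region at 0xd8000000 is programmed as base_lo = 0xd8000001 (type WC in bits 7:0), base_hi = 0, mask_lo = 0xfc000800 (valid bit 11 set) and mask_hi = 0xf. generic_get_mtrr() then forms mask_lo = 0xff000000 | (0xf << 20) | (0xfc000800 >> 12) = 0xffffc000, so *size = -mask_lo = 0x4000 pages (64 MB) and *base = 0xd8000 pages, i.e. 0xd8000000.
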
255
256/**
257 * Checks and updates the fixed-range MTRRs if they differ from the saved set
258 * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
259 */
260static int set_fixed_ranges(mtrr_type * frs)
261{
262 unsigned long long *saved = (unsigned long long *) frs;
263 int changed = FALSE;
264 int block=-1, range;
265
266 while (fixed_range_blocks[++block].ranges)
267 for (range=0; range < fixed_range_blocks[block].ranges; range++)
268 set_fixed_range(fixed_range_blocks[block].base_msr + range,
269 &changed, (unsigned int *) saved++);
270
271 return changed;
272}
273
274/* Set the MSR pair relating to a var range. Returns TRUE if
275 changes are made */
276static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
277{
278 unsigned int lo, hi;
279 int changed = FALSE;
280
281 rdmsr(MTRRphysBase_MSR(index), lo, hi);
282 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
283 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
284 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
285 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
286 changed = TRUE;
287 }
288
289 rdmsr(MTRRphysMask_MSR(index), lo, hi);
290
291 if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
292 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
293 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
294 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
295 changed = TRUE;
296 }
297 return changed;
298}
299
300static u32 deftype_lo, deftype_hi;
301
302static unsigned long set_mtrr_state(void)
303/* [SUMMARY] Set the MTRR state for this CPU.
304 <state> The MTRR state information to read.
305 <ctxt> Some relevant CPU context.
306 [NOTE] The CPU must already be in a safe state for MTRR changes.
307 [RETURNS] 0 if no changes made, else a mask indication what was changed.
308*/
309{
310 unsigned int i;
311 unsigned long change_mask = 0;
312
313 for (i = 0; i < num_var_ranges; i++)
314 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
315 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
316
317 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
318 change_mask |= MTRR_CHANGE_MASK_FIXED;
319
320 /* Set_mtrr_restore restores the old value of MTRRdefType,
321 so to set it we fiddle with the saved value */
322 if ((deftype_lo & 0xff) != mtrr_state.def_type
323 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
324 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
325 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
326 }
327
328 return change_mask;
329}
330
331
332static unsigned long cr4 = 0;
333static DEFINE_SPINLOCK(set_atomicity_lock);
334
335/*
336 * Since we are disabling the cache don't allow any interrupts - they
337 * would run extremely slow and would only increase the pain. The caller must
338 * ensure that local interrupts are disabled and are reenabled after post_set()
339 * has been called.
340 */
341
342static void prepare_set(void) __acquires(set_atomicity_lock)
343{
344 unsigned long cr0;
345
346 /* Note that this is not ideal, since the cache is only flushed/disabled
347 for this CPU while the MTRRs are changed, but changing this requires
348 more invasive changes to the way the kernel boots */
349
350 spin_lock(&set_atomicity_lock);
351
352 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
353 cr0 = read_cr0() | 0x40000000; /* set CD flag */
354 write_cr0(cr0);
355 wbinvd();
356
357 /* Save value of CR4 and clear Page Global Enable (bit 7) */
358 if ( cpu_has_pge ) {
359 cr4 = read_cr4();
360 write_cr4(cr4 & ~X86_CR4_PGE);
361 }
362
363 /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
364 __flush_tlb();
365
366 /* Save MTRR state */
367 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
368
369 /* Disable MTRRs, and set the default type to uncached */
370 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
371}
372
373static void post_set(void) __releases(set_atomicity_lock)
374{
375 /* Flush TLBs (no need to flush caches - they are disabled) */
376 __flush_tlb();
377
378 /* Intel (P6) standard MTRRs */
379 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
380
381 /* Enable caches */
382 write_cr0(read_cr0() & 0xbfffffff);
383
384 /* Restore value of CR4 */
385 if ( cpu_has_pge )
386 write_cr4(cr4);
387 spin_unlock(&set_atomicity_lock);
388}
389
390static void generic_set_all(void)
391{
392 unsigned long mask, count;
393 unsigned long flags;
394
395 local_irq_save(flags);
396 prepare_set();
397
398 /* Actually set the state */
399 mask = set_mtrr_state();
400
401 post_set();
402 local_irq_restore(flags);
403
404 /* Use the atomic bitops to update the global mask */
405 for (count = 0; count < sizeof mask * 8; ++count) {
406 if (mask & 0x01)
407 set_bit(count, &smp_changes_mask);
408 mask >>= 1;
409 }
410
411}
412
413static void generic_set_mtrr(unsigned int reg, unsigned long base,
414 unsigned long size, mtrr_type type)
415/* [SUMMARY] Set variable MTRR register on the local CPU.
416 <reg> The register to set.
417 <base> The base address of the region.
418 <size> The size of the region. If this is 0 the region is disabled.
419 <type> The type of the region.
420 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
421 be done externally.
422 [RETURNS] Nothing.
423*/
424{
425 unsigned long flags;
426 struct mtrr_var_range *vr;
427
428 vr = &mtrr_state.var_ranges[reg];
429
430 local_irq_save(flags);
431 prepare_set();
432
433 if (size == 0) {
434 /* The invalid bit is kept in the mask, so we simply clear the
435 relevant mask register to disable a range. */
436 mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
437 memset(vr, 0, sizeof(struct mtrr_var_range));
438 } else {
439 vr->base_lo = base << PAGE_SHIFT | type;
440 vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
441 vr->mask_lo = -size << PAGE_SHIFT | 0x800;
442 vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
443
444 mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
445 mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
446 }
447
448 post_set();
449 local_irq_restore(flags);
450}
451
452int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
453{
454 unsigned long lbase, last;
455
456 /* For Intel PPro stepping <= 7, must be 4 MiB aligned
457 and not touch 0x70000000->0x7003FFFF */
458 if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
459 boot_cpu_data.x86_model == 1 &&
460 boot_cpu_data.x86_mask <= 7) {
461 if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
462 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
463 return -EINVAL;
464 }
465 if (!(base + size < 0x70000 || base > 0x7003F) &&
466 (type == MTRR_TYPE_WRCOMB
467 || type == MTRR_TYPE_WRBACK)) {
468 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
469 return -EINVAL;
470 }
471 }
472
473 /* Check upper bits of base and last are equal and lower bits are 0
474 for base and 1 for last */
475 last = base + size - 1;
476 for (lbase = base; !(lbase & 1) && (last & 1);
477 lbase = lbase >> 1, last = last >> 1) ;
478 if (lbase != last) {
479 printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n",
480 base, size);
481 return -EINVAL;
482 }
483 return 0;
484}
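
Two worked instances of the alignment loop above (values illustrative, in pages): base = 0x40000 with size = 0x10000 gives last = 0x4ffff; after sixteen right shifts both lbase and last collapse to 0x4, so the request is accepted. With base = 0x40000 and size = 0xc000, last = 0x4bfff; the loop stops with lbase = 0x10 and last = 0x12, they differ, and the request is rejected with the warning above.
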
485
486
487static int generic_have_wrcomb(void)
488{
489 unsigned long config, dummy;
490 rdmsr(MTRRcap_MSR, config, dummy);
491 return (config & (1 << 10));
492}
493
494int positive_have_wrcomb(void)
495{
496 return 1;
497}
498
499/* generic structure...
500 */
501struct mtrr_ops generic_mtrr_ops = {
502 .use_intel_if = 1,
503 .set_all = generic_set_all,
504 .get = generic_get_mtrr,
505 .get_free_region = generic_get_free_region,
506 .set = generic_set_mtrr,
507 .validate_add_page = generic_validate_add_page,
508 .have_wrcomb = generic_have_wrcomb,
509};
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
new file mode 100644
index 000000000000..c7d8f1756745
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -0,0 +1,439 @@
1#include <linux/init.h>
2#include <linux/proc_fs.h>
3#include <linux/capability.h>
4#include <linux/ctype.h>
5#include <linux/module.h>
6#include <linux/seq_file.h>
7#include <asm/uaccess.h>
8
9#define LINE_SIZE 80
10
11#include <asm/mtrr.h>
12#include "mtrr.h"
13
14/* RED-PEN: this is accessed without any locking */
15extern unsigned int *usage_table;
16
17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19
20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{
22 "uncachable", /* 0 */
23 "write-combining", /* 1 */
24 "?", /* 2 */
25 "?", /* 3 */
26 "write-through", /* 4 */
27 "write-protect", /* 5 */
28 "write-back", /* 6 */
29};
30
31const char *mtrr_attrib_to_str(int x)
32{
33 return (x <= 6) ? mtrr_strings[x] : "?";
34}
35
36#ifdef CONFIG_PROC_FS
37
38static int
39mtrr_file_add(unsigned long base, unsigned long size,
40 unsigned int type, char increment, struct file *file, int page)
41{
42 int reg, max;
43 unsigned int *fcount = FILE_FCOUNT(file);
44
45 max = num_var_ranges;
46 if (fcount == NULL) {
47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount)
49 return -ENOMEM;
50 FILE_FCOUNT(file) = fcount;
51 }
52 if (!page) {
53 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
54 return -EINVAL;
55 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT;
57 }
58 reg = mtrr_add_page(base, size, type, 1);
59 if (reg >= 0)
60 ++fcount[reg];
61 return reg;
62}
63
64static int
65mtrr_file_del(unsigned long base, unsigned long size,
66 struct file *file, int page)
67{
68 int reg;
69 unsigned int *fcount = FILE_FCOUNT(file);
70
71 if (!page) {
72 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
73 return -EINVAL;
74 base >>= PAGE_SHIFT;
75 size >>= PAGE_SHIFT;
76 }
77 reg = mtrr_del_page(-1, base, size);
78 if (reg < 0)
79 return reg;
80 if (fcount == NULL)
81 return reg;
82 if (fcount[reg] < 1)
83 return -EINVAL;
84 --fcount[reg];
85 return reg;
86}
87
88/* RED-PEN: seq_file can seek now. this is ignored. */
89static ssize_t
90mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
91/* Format of control line:
92 "base=%Lx size=%Lx type=%s" OR:
93 "disable=%d"
94*/
95{
96 int i, err;
97 unsigned long reg;
98 unsigned long long base, size;
99 char *ptr;
100 char line[LINE_SIZE];
101 size_t linelen;
102
103 if (!capable(CAP_SYS_ADMIN))
104 return -EPERM;
105 if (!len)
106 return -EINVAL;
107 memset(line, 0, LINE_SIZE);
108 if (len > LINE_SIZE)
109 len = LINE_SIZE;
110 if (copy_from_user(line, buf, len - 1))
111 return -EFAULT;
112 linelen = strlen(line);
113 ptr = line + linelen - 1;
114 if (linelen && *ptr == '\n')
115 *ptr = '\0';
116 if (!strncmp(line, "disable=", 8)) {
117 reg = simple_strtoul(line + 8, &ptr, 0);
118 err = mtrr_del_page(reg, 0, 0);
119 if (err < 0)
120 return err;
121 return len;
122 }
123 if (strncmp(line, "base=", 5))
124 return -EINVAL;
125 base = simple_strtoull(line + 5, &ptr, 0);
126 for (; isspace(*ptr); ++ptr) ;
127 if (strncmp(ptr, "size=", 5))
128 return -EINVAL;
129 size = simple_strtoull(ptr + 5, &ptr, 0);
130 if ((base & 0xfff) || (size & 0xfff))
131 return -EINVAL;
132 for (; isspace(*ptr); ++ptr) ;
133 if (strncmp(ptr, "type=", 5))
134 return -EINVAL;
135 ptr += 5;
136 for (; isspace(*ptr); ++ptr) ;
137 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
138 if (strcmp(ptr, mtrr_strings[i]))
139 continue;
140 base >>= PAGE_SHIFT;
141 size >>= PAGE_SHIFT;
142 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1);
145 if (err < 0)
146 return err;
147 return len;
148 }
149 return -EINVAL;
150}
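
The control-line format parsed above is the one user space writes into /proc/mtrr. A minimal user-space sketch is shown below; the base and size values are illustrative only, and the write requires CAP_SYS_ADMIN.

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* same syntax as mtrr_write() expects: "base=... size=... type=..." */
		const char *cmd = "base=0xd8000000 size=0x4000000 type=write-combining\n";
		int fd = open("/proc/mtrr", O_WRONLY);

		if (fd < 0) {
			perror("open /proc/mtrr");
			return 1;
		}
		if (write(fd, cmd, strlen(cmd)) < 0)
			perror("write /proc/mtrr");
		close(fd);
		return 0;
	}

A region added this way can later be released by writing "disable=<regnum>" to the same file, which mtrr_write() routes to mtrr_del_page().
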
151
152static long
153mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
154{
155 int err = 0;
156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg;
161
162 switch (cmd) {
163 case MTRRIOC_ADD_ENTRY:
164 case MTRRIOC_SET_ENTRY:
165 case MTRRIOC_DEL_ENTRY:
166 case MTRRIOC_KILL_ENTRY:
167 case MTRRIOC_ADD_PAGE_ENTRY:
168 case MTRRIOC_SET_PAGE_ENTRY:
169 case MTRRIOC_DEL_PAGE_ENTRY:
170 case MTRRIOC_KILL_PAGE_ENTRY:
171 if (copy_from_user(&sentry, arg, sizeof sentry))
172 return -EFAULT;
173 break;
174 case MTRRIOC_GET_ENTRY:
175 case MTRRIOC_GET_PAGE_ENTRY:
176 if (copy_from_user(&gentry, arg, sizeof gentry))
177 return -EFAULT;
178 break;
179#ifdef CONFIG_COMPAT
180 case MTRRIOC32_ADD_ENTRY:
181 case MTRRIOC32_SET_ENTRY:
182 case MTRRIOC32_DEL_ENTRY:
183 case MTRRIOC32_KILL_ENTRY:
184 case MTRRIOC32_ADD_PAGE_ENTRY:
185 case MTRRIOC32_SET_PAGE_ENTRY:
186 case MTRRIOC32_DEL_PAGE_ENTRY:
187 case MTRRIOC32_KILL_PAGE_ENTRY: {
188 struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg;
189 err = get_user(sentry.base, &s32->base);
190 err |= get_user(sentry.size, &s32->size);
191 err |= get_user(sentry.type, &s32->type);
192 if (err)
193 return err;
194 break;
195 }
196 case MTRRIOC32_GET_ENTRY:
197 case MTRRIOC32_GET_PAGE_ENTRY: {
198 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
199 err = get_user(gentry.regnum, &g32->regnum);
200 err |= get_user(gentry.base, &g32->base);
201 err |= get_user(gentry.size, &g32->size);
202 err |= get_user(gentry.type, &g32->type);
203 if (err)
204 return err;
205 break;
206 }
207#endif
208 }
209
210 switch (cmd) {
211 default:
212 return -ENOTTY;
213 case MTRRIOC_ADD_ENTRY:
214#ifdef CONFIG_COMPAT
215 case MTRRIOC32_ADD_ENTRY:
216#endif
217 if (!capable(CAP_SYS_ADMIN))
218 return -EPERM;
219 err =
220 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
221 file, 0);
222 break;
223 case MTRRIOC_SET_ENTRY:
224#ifdef CONFIG_COMPAT
225 case MTRRIOC32_SET_ENTRY:
226#endif
227 if (!capable(CAP_SYS_ADMIN))
228 return -EPERM;
229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
230 break;
231 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT
233 case MTRRIOC32_DEL_ENTRY:
234#endif
235 if (!capable(CAP_SYS_ADMIN))
236 return -EPERM;
237 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
238 break;
239 case MTRRIOC_KILL_ENTRY:
240#ifdef CONFIG_COMPAT
241 case MTRRIOC32_KILL_ENTRY:
242#endif
243 if (!capable(CAP_SYS_ADMIN))
244 return -EPERM;
245 err = mtrr_del(-1, sentry.base, sentry.size);
246 break;
247 case MTRRIOC_GET_ENTRY:
248#ifdef CONFIG_COMPAT
249 case MTRRIOC32_GET_ENTRY:
250#endif
251 if (gentry.regnum >= num_var_ranges)
252 return -EINVAL;
253 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
254
255 /* Hide entries that go above 4GB */
256 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
257 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
258 gentry.base = gentry.size = gentry.type = 0;
259 else {
260 gentry.base <<= PAGE_SHIFT;
261 gentry.size = size << PAGE_SHIFT;
262 gentry.type = type;
263 }
264
265 break;
266 case MTRRIOC_ADD_PAGE_ENTRY:
267#ifdef CONFIG_COMPAT
268 case MTRRIOC32_ADD_PAGE_ENTRY:
269#endif
270 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM;
272 err =
273 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
274 file, 1);
275 break;
276 case MTRRIOC_SET_PAGE_ENTRY:
277#ifdef CONFIG_COMPAT
278 case MTRRIOC32_SET_PAGE_ENTRY:
279#endif
280 if (!capable(CAP_SYS_ADMIN))
281 return -EPERM;
282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
283 break;
284 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT
286 case MTRRIOC32_DEL_PAGE_ENTRY:
287#endif
288 if (!capable(CAP_SYS_ADMIN))
289 return -EPERM;
290 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
291 break;
292 case MTRRIOC_KILL_PAGE_ENTRY:
293#ifdef CONFIG_COMPAT
294 case MTRRIOC32_KILL_PAGE_ENTRY:
295#endif
296 if (!capable(CAP_SYS_ADMIN))
297 return -EPERM;
298 err = mtrr_del_page(-1, sentry.base, sentry.size);
299 break;
300 case MTRRIOC_GET_PAGE_ENTRY:
301#ifdef CONFIG_COMPAT
302 case MTRRIOC32_GET_PAGE_ENTRY:
303#endif
304 if (gentry.regnum >= num_var_ranges)
305 return -EINVAL;
306 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
307 /* Hide entries that would overflow */
308 if (size != (__typeof__(gentry.size))size)
309 gentry.base = gentry.size = gentry.type = 0;
310 else {
311 gentry.size = size;
312 gentry.type = type;
313 }
314 break;
315 }
316
317 if (err)
318 return err;
319
320 switch(cmd) {
321 case MTRRIOC_GET_ENTRY:
322 case MTRRIOC_GET_PAGE_ENTRY:
323 if (copy_to_user(arg, &gentry, sizeof gentry))
324 err = -EFAULT;
325 break;
326#ifdef CONFIG_COMPAT
327 case MTRRIOC32_GET_ENTRY:
328 case MTRRIOC32_GET_PAGE_ENTRY: {
329 struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg;
330 err = put_user(gentry.base, &g32->base);
331 err |= put_user(gentry.size, &g32->size);
332 err |= put_user(gentry.regnum, &g32->regnum);
333 err |= put_user(gentry.type, &g32->type);
334 break;
335 }
336#endif
337 }
338 return err;
339}
340
341static int
342mtrr_close(struct inode *ino, struct file *file)
343{
344 int i, max;
345 unsigned int *fcount = FILE_FCOUNT(file);
346
347 if (fcount != NULL) {
348 max = num_var_ranges;
349 for (i = 0; i < max; ++i) {
350 while (fcount[i] > 0) {
351 mtrr_del(i, 0, 0);
352 --fcount[i];
353 }
354 }
355 kfree(fcount);
356 FILE_FCOUNT(file) = NULL;
357 }
358 return single_release(ino, file);
359}
360
361static int mtrr_seq_show(struct seq_file *seq, void *offset);
362
363static int mtrr_open(struct inode *inode, struct file *file)
364{
365 if (!mtrr_if)
366 return -EIO;
367 if (!mtrr_if->get)
368 return -ENXIO;
369 return single_open(file, mtrr_seq_show, NULL);
370}
371
372static const struct file_operations mtrr_fops = {
373 .owner = THIS_MODULE,
374 .open = mtrr_open,
375 .read = seq_read,
376 .llseek = seq_lseek,
377 .write = mtrr_write,
378 .unlocked_ioctl = mtrr_ioctl,
379 .compat_ioctl = mtrr_ioctl,
380 .release = mtrr_close,
381};
382
383
384static struct proc_dir_entry *proc_root_mtrr;
385
386
387static int mtrr_seq_show(struct seq_file *seq, void *offset)
388{
389 char factor;
390 int i, max, len;
391 mtrr_type type;
392 unsigned long base, size;
393
394 len = 0;
395 max = num_var_ranges;
396 for (i = 0; i < max; i++) {
397 mtrr_if->get(i, &base, &size, &type);
398 if (size == 0)
399 usage_table[i] = 0;
400 else {
401 if (size < (0x100000 >> PAGE_SHIFT)) {
402 /* less than 1MB */
403 factor = 'K';
404 size <<= PAGE_SHIFT - 10;
405 } else {
406 factor = 'M';
407 size >>= 20 - PAGE_SHIFT;
408 }
409 /* RED-PEN: base can be > 32bit */
410 len += seq_printf(seq,
411 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
412 i, base, base >> (20 - PAGE_SHIFT), size, factor,
413 mtrr_attrib_to_str(type), usage_table[i]);
414 }
415 }
416 return 0;
417}
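
With illustrative values, the format string above produces /proc/mtrr lines such as:

	reg00: base=0x00000000 (   0MB), size=1024MB: write-back, count=1

i.e. the register number, the base (printed as a 4 KiB page number with "000" appended, giving the byte address), the size scaled to KB or MB, the attribute string from mtrr_attrib_to_str(), and the usage count.
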
418
419static int __init mtrr_if_init(void)
420{
421 struct cpuinfo_x86 *c = &boot_cpu_data;
422
423 if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
424 (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
425 (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
426 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
427 return -ENODEV;
428
429 proc_root_mtrr =
430 create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root);
431 if (proc_root_mtrr) {
432 proc_root_mtrr->owner = THIS_MODULE;
433 proc_root_mtrr->proc_fops = &mtrr_fops;
434 }
435 return 0;
436}
437
438arch_initcall(mtrr_if_init);
439#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
new file mode 100644
index 000000000000..c48b6fea5ab4
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -0,0 +1,768 @@
1/* Generic MTRR (Memory Type Range Register) driver.
2
3 Copyright (C) 1997-2000 Richard Gooch
4 Copyright (c) 2002 Patrick Mochel
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public
17 License along with this library; if not, write to the Free
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
20 Richard Gooch may be reached by email at rgooch@atnf.csiro.au
21 The postal address is:
22 Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
23
24 Source: "Pentium Pro Family Developer's Manual, Volume 3:
25 Operating System Writer's Guide" (Intel document number 242692),
26 section 11.11.7
27
28 This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29 on 6-7 March 2002.
30 Source: Intel Architecture Software Developers Manual, Volume 3:
31 System Programming Guide; Section 9.11. (1997 edition - PPro).
32*/
33
34#include <linux/module.h>
35#include <linux/init.h>
36#include <linux/pci.h>
37#include <linux/smp.h>
38#include <linux/cpu.h>
39#include <linux/mutex.h>
40
41#include <asm/mtrr.h>
42
43#include <asm/uaccess.h>
44#include <asm/processor.h>
45#include <asm/msr.h>
46#include "mtrr.h"
47
48u32 num_var_ranges = 0;
49
50unsigned int *usage_table;
51static DEFINE_MUTEX(mtrr_mutex);
52
53u64 size_or_mask, size_and_mask;
54
55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
56
57struct mtrr_ops * mtrr_if = NULL;
58
59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type);
61
62#ifndef CONFIG_X86_64
63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
67
68void set_mtrr_ops(struct mtrr_ops * ops)
69{
70 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
71 mtrr_ops[ops->vendor] = ops;
72}
73
74/* Returns non-zero if we have the write-combining memory type */
75static int have_wrcomb(void)
76{
77 struct pci_dev *dev;
78 u8 rev;
79
80 if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
81		/* ServerWorks LE chipsets < rev 6 have problems with write-combining.
82		   Don't allow it and leave room for other chipsets to be tagged */
83 if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
84 dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
85 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
86 if (rev <= 5) {
87 printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
88 pci_dev_put(dev);
89 return 0;
90 }
91 }
92		/* Intel 450NX errata #23. Non-ascending cacheline evictions to
93		   write-combining memory may result in data corruption */
94 if (dev->vendor == PCI_VENDOR_ID_INTEL &&
95 dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
96 printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
97 pci_dev_put(dev);
98 return 0;
99 }
100 pci_dev_put(dev);
101 }
102 return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
103}
104
105/* This function determines the number of variable MTRRs */
106static void __init set_num_var_ranges(void)
107{
108 unsigned long config = 0, dummy;
109
110 if (use_intel()) {
111 rdmsr(MTRRcap_MSR, config, dummy);
112 } else if (is_cpu(AMD))
113 config = 2;
114 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
115 config = 8;
116 num_var_ranges = config & 0xff;
117}
118
119static void __init init_table(void)
120{
121 int i, max;
122
123 max = num_var_ranges;
124 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125 == NULL) {
126 printk(KERN_ERR "mtrr: could not allocate\n");
127 return;
128 }
129 for (i = 0; i < max; i++)
130 usage_table[i] = 1;
131}
132
133struct set_mtrr_data {
134 atomic_t count;
135 atomic_t gate;
136 unsigned long smp_base;
137 unsigned long smp_size;
138 unsigned int smp_reg;
139 mtrr_type smp_type;
140};
141
142#ifdef CONFIG_SMP
143
144static void ipi_handler(void *info)
145/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
146 [RETURNS] Nothing.
147*/
148{
149 struct set_mtrr_data *data = info;
150 unsigned long flags;
151
152 local_irq_save(flags);
153
154 atomic_dec(&data->count);
155 while(!atomic_read(&data->gate))
156 cpu_relax();
157
158 /* The master has cleared me to execute */
159 if (data->smp_reg != ~0U)
160 mtrr_if->set(data->smp_reg, data->smp_base,
161 data->smp_size, data->smp_type);
162 else
163 mtrr_if->set_all();
164
165 atomic_dec(&data->count);
166 while(atomic_read(&data->gate))
167 cpu_relax();
168
169 atomic_dec(&data->count);
170 local_irq_restore(flags);
171}
172
173#endif
174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
182/**
183 * set_mtrr - update mtrrs on all processors
184 * @reg: mtrr in question
185 * @base: mtrr base
186 * @size: mtrr size
187 * @type: mtrr type
188 *
189 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
190 *
191 * 1. Send IPI to do the following:
192 * 2. Disable Interrupts
193 * 3. Wait for all procs to do so
194 * 4. Enter no-fill cache mode
195 * 5. Flush caches
196 * 6. Clear PGE bit
197 * 7. Flush all TLBs
198 * 8. Disable all range registers
199 * 9. Update the MTRRs
200 * 10. Enable all range registers
201 * 11. Flush all TLBs and caches again
202 * 12. Enter normal cache mode and reenable caching
203 * 13. Set PGE
204 * 14. Wait for buddies to catch up
205 * 15. Enable interrupts.
206 *
207 * What does that mean for us? Well, first we set data.count to the number
208 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
209 * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
210 * Meanwhile, they are waiting for that flag to be set. Once it's set, each
211 * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
212 * differently, so we call mtrr_if->set() callback and let them take care of it.
213 * When they're done, they again decrement data->count and wait for data.gate to
214 * be reset.
215 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
216 * Everyone then enables interrupts and we all continue on.
217 *
218 * Note that the mechanism is the same for UP systems, too; all the SMP stuff
219 * becomes nops.
220 */
221static void set_mtrr(unsigned int reg, unsigned long base,
222 unsigned long size, mtrr_type type)
223{
224 struct set_mtrr_data data;
225 unsigned long flags;
226
227 data.smp_reg = reg;
228 data.smp_base = base;
229 data.smp_size = size;
230 data.smp_type = type;
231 atomic_set(&data.count, num_booting_cpus() - 1);
232 /* make sure data.count is visible before unleashing other CPUs */
233 smp_wmb();
234 atomic_set(&data.gate,0);
235
236 /* Start the ball rolling on other CPUs */
237 if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
238 panic("mtrr: timed out waiting for other CPUs\n");
239
240 local_irq_save(flags);
241
242 while(atomic_read(&data.count))
243 cpu_relax();
244
245 /* ok, reset count and toggle gate */
246 atomic_set(&data.count, num_booting_cpus() - 1);
247 smp_wmb();
248 atomic_set(&data.gate,1);
249
250 /* do our MTRR business */
251
252 /* HACK!
253 * We use this same function to initialize the mtrrs on boot.
254 * The state of the boot cpu's mtrrs has been saved, and we want
255 * to replicate across all the APs.
256 * If we're doing that @reg is set to something special...
257 */
258 if (reg != ~0U)
259 mtrr_if->set(reg,base,size,type);
260
261 /* wait for the others */
262 while(atomic_read(&data.count))
263 cpu_relax();
264
265 atomic_set(&data.count, num_booting_cpus() - 1);
266 smp_wmb();
267 atomic_set(&data.gate,0);
268
269 /*
270 * Wait here for everyone to have seen the gate change
271 * So we're the last ones to touch 'data'
272 */
273 while(atomic_read(&data.count))
274 cpu_relax();
275
276 local_irq_restore(flags);
277}
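
The fifteen-step recipe above reduces to a three-phase count/gate handshake between the CPU driving the change and everyone else. The sketch below models just that synchronisation in user space with C11 atomics and pthreads; NCPUS and the printf() calls are illustrative stand-ins for the IPI handler, disabled interrupts and mtrr_if->set(), so this is an illustration of the protocol only, not kernel code.

	#include <stdatomic.h>
	#include <pthread.h>
	#include <stdio.h>

	#define NCPUS 4

	static atomic_int count;
	static atomic_int gate;

	static void wait_count_zero(void)
	{
		while (atomic_load(&count))
			;			/* cpu_relax() in the kernel */
	}

	static void *ap_thread(void *arg)
	{
		atomic_fetch_sub(&count, 1);	/* phase 1: "interrupts disabled" */
		while (!atomic_load(&gate))
			;
		printf("cpu %ld: updating MTRRs\n", (long)arg);	/* mtrr_if->set() */
		atomic_fetch_sub(&count, 1);	/* phase 2: update done */
		while (atomic_load(&gate))
			;
		atomic_fetch_sub(&count, 1);	/* phase 3: ready to resume */
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NCPUS - 1];
		long i;

		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 0);
		for (i = 0; i < NCPUS - 1; i++)
			pthread_create(&t[i], NULL, ap_thread, (void *)i);

		wait_count_zero();		/* everyone quiesced */
		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 1);		/* release them to update */
		printf("boot cpu: updating MTRRs\n");
		wait_count_zero();		/* everyone has updated */
		atomic_store(&count, NCPUS - 1);
		atomic_store(&gate, 0);		/* let them finish up */
		wait_count_zero();
		for (i = 0; i < NCPUS - 1; i++)
			pthread_join(t[i], NULL);
		return 0;
	}
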
278
279/**
280 * mtrr_add_page - Add a memory type region
281 * @base: Physical base address of region in pages (in units of 4 kB!)
282 * @size: Physical size of region in pages (4 kB)
283 * @type: Type of MTRR desired
284 * @increment: If this is true do usage counting on the region
285 *
286 * Memory type region registers control the caching on newer Intel and
287 * non Intel processors. This function allows drivers to request an
288 * MTRR is added. The details and hardware specifics of each processor's
289 * implementation are hidden from the caller, but nevertheless the
290 * caller should expect to need to provide a power of two size on an
291 * equivalent power of two boundary.
292 *
293 * If the region cannot be added either because all regions are in use
294 * or the CPU cannot support it a negative value is returned. On success
295 * the register number for this entry is returned, but should be treated
296 * as a cookie only.
297 *
298 * On a multiprocessor machine the changes are made to all processors.
299 * This is required on x86 by the Intel processors.
300 *
301 * The available types are
302 *
303 * %MTRR_TYPE_UNCACHABLE - No caching
304 *
305 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
306 *
307 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
308 *
309 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
310 *
311 * BUGS: Needs a quiet flag for the cases where drivers do not mind
312 * failures and do not wish system log messages to be sent.
313 */
314
315int mtrr_add_page(unsigned long base, unsigned long size,
316 unsigned int type, char increment)
317{
318 int i, replace, error;
319 mtrr_type ltype;
320 unsigned long lbase, lsize;
321
322 if (!mtrr_if)
323 return -ENXIO;
324
325 if ((error = mtrr_if->validate_add_page(base,size,type)))
326 return error;
327
328 if (type >= MTRR_NUM_TYPES) {
329 printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
330 return -EINVAL;
331 }
332
333 /* If the type is WC, check that this processor supports it */
334 if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
335 printk(KERN_WARNING
336 "mtrr: your processor doesn't support write-combining\n");
337 return -ENOSYS;
338 }
339
340 if (!size) {
341 printk(KERN_WARNING "mtrr: zero sized request\n");
342 return -EINVAL;
343 }
344
345 if (base & size_or_mask || size & size_or_mask) {
346 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
347 return -EINVAL;
348 }
349
350 error = -EINVAL;
351 replace = -1;
352
353 /* No CPU hotplug when we change MTRR entries */
354 lock_cpu_hotplug();
355 /* Search for existing MTRR */
356 mutex_lock(&mtrr_mutex);
357 for (i = 0; i < num_var_ranges; ++i) {
358 mtrr_if->get(i, &lbase, &lsize, &ltype);
359 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
360 continue;
361 /* At this point we know there is some kind of overlap/enclosure */
362 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
363 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
364 /* New region encloses an existing region */
365 if (type == ltype) {
366 replace = replace == -1 ? i : -2;
367 continue;
368 }
369 else if (types_compatible(type, ltype))
370 continue;
371 }
372 printk(KERN_WARNING
373 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
374 " 0x%lx000,0x%lx000\n", base, size, lbase,
375 lsize);
376 goto out;
377 }
378 /* New region is enclosed by an existing region */
379 if (ltype != type) {
380 if (types_compatible(type, ltype))
381 continue;
382 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
383 base, size, mtrr_attrib_to_str(ltype),
384 mtrr_attrib_to_str(type));
385 goto out;
386 }
387 if (increment)
388 ++usage_table[i];
389 error = i;
390 goto out;
391 }
392 /* Search for an empty MTRR */
393 i = mtrr_if->get_free_region(base, size, replace);
394 if (i >= 0) {
395 set_mtrr(i, base, size, type);
396 if (likely(replace < 0))
397 usage_table[i] = 1;
398 else {
399 usage_table[i] = usage_table[replace] + !!increment;
400 if (unlikely(replace != i)) {
401 set_mtrr(replace, 0, 0, 0);
402 usage_table[replace] = 0;
403 }
404 }
405 } else
406 printk(KERN_INFO "mtrr: no more MTRRs available\n");
407 error = i;
408 out:
409 mutex_unlock(&mtrr_mutex);
410 unlock_cpu_hotplug();
411 return error;
412}
413
414static int mtrr_check(unsigned long base, unsigned long size)
415{
416 if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
417 printk(KERN_WARNING
418 "mtrr: size and base must be multiples of 4 kiB\n");
419 printk(KERN_DEBUG
420 "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
421 dump_stack();
422 return -1;
423 }
424 return 0;
425}
426
427/**
428 * mtrr_add - Add a memory type region
429 * @base: Physical base address of region
430 * @size: Physical size of region
431 * @type: Type of MTRR desired
432 * @increment: If this is true do usage counting on the region
433 *
434 * Memory type region registers control the caching on newer Intel and
435 * non Intel processors. This function allows drivers to request an
436 * MTRR is added. The details and hardware specifics of each processor's
437 * implementation are hidden from the caller, but nevertheless the
438 * caller should expect to need to provide a power of two size on an
439 * equivalent power of two boundary.
440 *
441 * If the region cannot be added either because all regions are in use
442 * or the CPU cannot support it a negative value is returned. On success
443 * the register number for this entry is returned, but should be treated
444 * as a cookie only.
445 *
446 * On a multiprocessor machine the changes are made to all processors.
447 * This is required on x86 by the Intel processors.
448 *
449 * The available types are
450 *
451 * %MTRR_TYPE_UNCACHABLE - No caching
452 *
453 * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
454 *
455 * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
456 *
457 * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
458 *
459 * BUGS: Needs a quiet flag for the cases where drivers do not mind
460 * failures and do not wish system log messages to be sent.
461 */
462
463int
464mtrr_add(unsigned long base, unsigned long size, unsigned int type,
465 char increment)
466{
467 if (mtrr_check(base, size))
468 return -EINVAL;
469 return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
470 increment);
471}
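
Typical driver usage of the interface documented above looks like the sketch below (the framebuffer base and size are illustrative; a real driver would take them from its PCI BAR). The return value is kept only as the cookie to pass back to mtrr_del().

	static int example_fb_mtrr = -1;	/* hypothetical driver state */

	static void example_fb_enable_wc(unsigned long fb_base, unsigned long fb_size)
	{
		/* e.g. fb_base = 0xd8000000, fb_size = 0x4000000 (64 MB) */
		example_fb_mtrr = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, 1);
		if (example_fb_mtrr < 0)
			printk(KERN_INFO "example: write-combining not enabled\n");
	}

	static void example_fb_disable_wc(unsigned long fb_base, unsigned long fb_size)
	{
		if (example_fb_mtrr >= 0)
			mtrr_del(example_fb_mtrr, fb_base, fb_size);
	}
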
472
473/**
474 * mtrr_del_page - delete a memory type region
475 * @reg: Register returned by mtrr_add
476 * @base: Physical base address
477 * @size: Size of region
478 *
479 * If register is supplied then base and size are ignored. This is
480 * how drivers should call it.
481 *
482 * Releases an MTRR region. If the usage count drops to zero the
483 * register is freed and the region returns to default state.
484 * On success the register is returned, on failure a negative error
485 * code.
486 */
487
488int mtrr_del_page(int reg, unsigned long base, unsigned long size)
489{
490 int i, max;
491 mtrr_type ltype;
492 unsigned long lbase, lsize;
493 int error = -EINVAL;
494
495 if (!mtrr_if)
496 return -ENXIO;
497
498 max = num_var_ranges;
499 /* No CPU hotplug when we change MTRR entries */
500 lock_cpu_hotplug();
501 mutex_lock(&mtrr_mutex);
502 if (reg < 0) {
503 /* Search for existing MTRR */
504 for (i = 0; i < max; ++i) {
505 mtrr_if->get(i, &lbase, &lsize, &ltype);
506 if (lbase == base && lsize == size) {
507 reg = i;
508 break;
509 }
510 }
511 if (reg < 0) {
512 printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
513 size);
514 goto out;
515 }
516 }
517 if (reg >= max) {
518 printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
519 goto out;
520 }
521 if (is_cpu(CYRIX) && !use_intel()) {
522 if ((reg == 3) && arr3_protected) {
523 printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
524 goto out;
525 }
526 }
527 mtrr_if->get(reg, &lbase, &lsize, &ltype);
528 if (lsize < 1) {
529 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
530 goto out;
531 }
532 if (usage_table[reg] < 1) {
533 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
534 goto out;
535 }
536 if (--usage_table[reg] < 1)
537 set_mtrr(reg, 0, 0, 0);
538 error = reg;
539 out:
540 mutex_unlock(&mtrr_mutex);
541 unlock_cpu_hotplug();
542 return error;
543}
544/**
545 * mtrr_del - delete a memory type region
546 * @reg: Register returned by mtrr_add
547 * @base: Physical base address
548 * @size: Size of region
549 *
550 * If register is supplied then base and size are ignored. This is
551 * how drivers should call it.
552 *
553 * Releases an MTRR region. If the usage count drops to zero the
554 * register is freed and the region returns to default state.
555 * On success the register is returned, on failure a negative error
556 * code.
557 */
558
559int
560mtrr_del(int reg, unsigned long base, unsigned long size)
561{
562 if (mtrr_check(base, size))
563 return -EINVAL;
564 return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
565}
566
567EXPORT_SYMBOL(mtrr_add);
568EXPORT_SYMBOL(mtrr_del);
569
570/* HACK ALERT!
571 * These should be called implicitly, but we can't yet until all the initcall
572 * stuff is done...
573 */
574extern void amd_init_mtrr(void);
575extern void cyrix_init_mtrr(void);
576extern void centaur_init_mtrr(void);
577
578static void __init init_ifs(void)
579{
580#ifndef CONFIG_X86_64
581 amd_init_mtrr();
582 cyrix_init_mtrr();
583 centaur_init_mtrr();
584#endif
585}
586
587/* The suspend/resume methods are only for CPUs without MTRRs. CPUs using the
588 * generic MTRR driver don't require this.
589 */
590struct mtrr_value {
591 mtrr_type ltype;
592 unsigned long lbase;
593 unsigned long lsize;
594};
595
596static struct mtrr_value * mtrr_state;
597
598static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
599{
600 int i;
601 int size = num_var_ranges * sizeof(struct mtrr_value);
602
603 mtrr_state = kzalloc(size,GFP_ATOMIC);
604 if (!mtrr_state)
605 return -ENOMEM;
606
607 for (i = 0; i < num_var_ranges; i++) {
608 mtrr_if->get(i,
609 &mtrr_state[i].lbase,
610 &mtrr_state[i].lsize,
611 &mtrr_state[i].ltype);
612 }
613 return 0;
614}
615
616static int mtrr_restore(struct sys_device * sysdev)
617{
618 int i;
619
620 for (i = 0; i < num_var_ranges; i++) {
621 if (mtrr_state[i].lsize)
622 set_mtrr(i,
623 mtrr_state[i].lbase,
624 mtrr_state[i].lsize,
625 mtrr_state[i].ltype);
626 }
627 kfree(mtrr_state);
628 return 0;
629}
630
631
632
633static struct sysdev_driver mtrr_sysdev_driver = {
634 .suspend = mtrr_save,
635 .resume = mtrr_restore,
636};
637
638
639/**
640 * mtrr_bp_init - initialize mtrrs on the boot CPU
641 *
642 * This needs to be called early; before any of the other CPUs are
643 * initialized (i.e. before smp_init()).
644 *
645 */
646void __init mtrr_bp_init(void)
647{
648 init_ifs();
649
650 if (cpu_has_mtrr) {
651 mtrr_if = &generic_mtrr_ops;
652 size_or_mask = 0xff000000; /* 36 bits */
653 size_and_mask = 0x00f00000;
654
655		/* This is an AMD-specific MSR, but we assume (hope?) that
656		   Intel will implement it too when they extend the address
657		   bus of the Xeon. */
658 if (cpuid_eax(0x80000000) >= 0x80000008) {
659 u32 phys_addr;
660 phys_addr = cpuid_eax(0x80000008) & 0xff;
661 /* CPUID workaround for Intel 0F33/0F34 CPU */
662 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
663 boot_cpu_data.x86 == 0xF &&
664 boot_cpu_data.x86_model == 0x3 &&
665 (boot_cpu_data.x86_mask == 0x3 ||
666 boot_cpu_data.x86_mask == 0x4))
667 phys_addr = 36;
668
669 size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
670 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
671 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
672 boot_cpu_data.x86 == 6) {
673 /* VIA C* family have Intel style MTRRs, but
674 don't support PAE */
675 size_or_mask = 0xfff00000; /* 32 bits */
676 size_and_mask = 0;
677 }
678 } else {
679 switch (boot_cpu_data.x86_vendor) {
680 case X86_VENDOR_AMD:
681 if (cpu_has_k6_mtrr) {
682 /* Pre-Athlon (K6) AMD CPU MTRRs */
683 mtrr_if = mtrr_ops[X86_VENDOR_AMD];
684 size_or_mask = 0xfff00000; /* 32 bits */
685 size_and_mask = 0;
686 }
687 break;
688 case X86_VENDOR_CENTAUR:
689 if (cpu_has_centaur_mcr) {
690 mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
691 size_or_mask = 0xfff00000; /* 32 bits */
692 size_and_mask = 0;
693 }
694 break;
695 case X86_VENDOR_CYRIX:
696 if (cpu_has_cyrix_arr) {
697 mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
698 size_or_mask = 0xfff00000; /* 32 bits */
699 size_and_mask = 0;
700 }
701 break;
702 default:
703 break;
704 }
705 }
706
707 if (mtrr_if) {
708 set_num_var_ranges();
709 init_table();
710 if (use_intel())
711 get_mtrr_state();
712 }
713}
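
A worked instance of the mask setup above (a sketch, assuming PAGE_SHIFT = 12): with a CPUID-reported physical address width of 40 bits, phys_addr - PAGE_SHIFT = 28, so size_or_mask = ~((1ULL << 28) - 1) = 0xfffffffff0000000 and size_and_mask = ~size_or_mask & 0xfffff00000 = 0x0ff00000. Both masks operate on page-frame numbers: mtrr_add_page() rejects any base or size with bits set in size_or_mask, and generic_set_mtrr() uses size_and_mask to keep only the implemented high address bits when it programs base_hi and mask_hi.
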
714
715void mtrr_ap_init(void)
716{
717 unsigned long flags;
718
719 if (!mtrr_if || !use_intel())
720 return;
721	/*
722	 * Ideally we should hold mtrr_mutex here to avoid MTRR entries being
723	 * changed, but this routine is called at CPU boot time and holding the
724	 * lock there breaks it. This routine is called in two cases: 1. very
725	 * early during software resume, when there are absolutely no MTRR
726	 * entry changes; 2. at CPU hot-add time. We let mtrr_add/del_page hold
727	 * the cpuhotplug lock to prevent MTRR entry changes.
728	 */
729 local_irq_save(flags);
730
731 mtrr_if->set_all();
732
733 local_irq_restore(flags);
734}
735
736/**
737 * Save current fixed-range MTRR state of the BSP
738 */
739void mtrr_save_state(void)
740{
741 int cpu = get_cpu();
742
743 if (cpu == 0)
744 mtrr_save_fixed_ranges(NULL);
745 else
746 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
747 put_cpu();
748}
749
750static int __init mtrr_init_finialize(void)
751{
752 if (!mtrr_if)
753 return 0;
754 if (use_intel())
755 mtrr_state_warn();
756 else {
757		/* These CPUs have no MTRRs and do not seem to support SMP. They
758		 * have specific drivers; we use a tricky method to support
759		 * suspend/resume for them.
760		 * TBD: is there any system with such a CPU which supports
761		 * suspend/resume? If not, we should remove this code.
762		 */
763 sysdev_driver_register(&cpu_sysdev_class,
764 &mtrr_sysdev_driver);
765 }
766 return 0;
767}
768subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
new file mode 100644
index 000000000000..289dfe6030e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -0,0 +1,98 @@
1/*
2 * local mtrr defines.
3 */
4
5#ifndef TRUE
6#define TRUE 1
7#define FALSE 0
8#endif
9
10#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff
12
13#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15
16#define NUM_FIXED_RANGES 88
17#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259
20#define MTRRfix4K_C0000_MSR 0x268
21#define MTRRfix4K_C8000_MSR 0x269
22#define MTRRfix4K_D0000_MSR 0x26a
23#define MTRRfix4K_D8000_MSR 0x26b
24#define MTRRfix4K_E0000_MSR 0x26c
25#define MTRRfix4K_E8000_MSR 0x26d
26#define MTRRfix4K_F0000_MSR 0x26e
27#define MTRRfix4K_F8000_MSR 0x26f
28
29#define MTRR_CHANGE_MASK_FIXED 0x01
30#define MTRR_CHANGE_MASK_VARIABLE 0x02
31#define MTRR_CHANGE_MASK_DEFTYPE 0x04
32
33/* In the Intel processor's MTRR interface, the MTRR type is always held in
34 an 8 bit field: */
35typedef u8 mtrr_type;
36
37struct mtrr_ops {
38 u32 vendor;
39 u32 use_intel_if;
40// void (*init)(void);
41 void (*set)(unsigned int reg, unsigned long base,
42 unsigned long size, mtrr_type type);
43 void (*set_all)(void);
44
45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region)(unsigned long base, unsigned long size,
48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type);
51 int (*have_wrcomb)(void);
52};
53
54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
56extern int generic_validate_add_page(unsigned long base, unsigned long size,
57 unsigned int type);
58
59extern struct mtrr_ops generic_mtrr_ops;
60
61extern int positive_have_wrcomb(void);
62
63/* library functions for processor-specific routines */
64struct set_mtrr_context {
65 unsigned long flags;
66 unsigned long cr4val;
67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
70};
71
72struct mtrr_var_range {
73 u32 base_lo;
74 u32 base_hi;
75 u32 mask_lo;
76 u32 mask_hi;
77};
78
79void set_mtrr_done(struct set_mtrr_context *ctxt);
80void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
81void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
82
83void get_mtrr_state(void);
84
85extern void set_mtrr_ops(struct mtrr_ops * ops);
86
87extern u64 size_or_mask, size_and_mask;
88extern struct mtrr_ops * mtrr_if;
89
90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
91#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
92
93extern unsigned int num_var_ranges;
94
95void mtrr_state_warn(void);
96const char *mtrr_attrib_to_str(int x);
97void mtrr_wrmsr(unsigned, unsigned, unsigned);
98
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
new file mode 100644
index 000000000000..49e20c2afcdf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -0,0 +1,79 @@
1#include <linux/mm.h>
2#include <linux/init.h>
3#include <asm/io.h>
4#include <asm/mtrr.h>
5#include <asm/msr.h>
6#include <asm/processor-cyrix.h>
7#include "mtrr.h"
8
9
10/* Put the processor into a state where MTRRs can be safely set */
11void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
12{
13 unsigned int cr0;
14
15 /* Disable interrupts locally */
16 local_irq_save(ctxt->flags);
17
18 if (use_intel() || is_cpu(CYRIX)) {
19
20 /* Save value of CR4 and clear Page Global Enable (bit 7) */
21 if ( cpu_has_pge ) {
22 ctxt->cr4val = read_cr4();
23 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
24 }
25
26 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
27 a side-effect */
28 cr0 = read_cr0() | 0x40000000;
29 wbinvd();
30 write_cr0(cr0);
31 wbinvd();
32
33 if (use_intel())
34 /* Save MTRR state */
35 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
36 else
37			/* Cyrix ARRs - everything else was excluded at the top */
38 ctxt->ccr3 = getCx86(CX86_CCR3);
39 }
40}
41
42void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
43{
44 if (use_intel())
45 /* Disable MTRRs, and set the default type to uncached */
46 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
47 ctxt->deftype_hi);
48 else if (is_cpu(CYRIX))
49		/* Cyrix ARRs - everything else was excluded at the top */
50 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
51}
52
53/* Restore the processor after a set_mtrr_prepare */
54void set_mtrr_done(struct set_mtrr_context *ctxt)
55{
56 if (use_intel() || is_cpu(CYRIX)) {
57
58 /* Flush caches and TLBs */
59 wbinvd();
60
61 /* Restore MTRRdefType */
62 if (use_intel())
63 /* Intel (P6) standard MTRRs */
64 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
65 else
66 /* Cyrix ARRs - everything else was excluded at the top */
67 setCx86(CX86_CCR3, ctxt->ccr3);
68
69 /* Enable caches */
70 write_cr0(read_cr0() & 0xbfffffff);
71
72 /* Restore value of CR4 */
73 if ( cpu_has_pge )
74 write_cr4(ctxt->cr4val);
75 }
76 /* Re-enable interrupts locally (if enabled previously) */
77 local_irq_restore(ctxt->flags);
78}
79
diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c
new file mode 100644
index 000000000000..961fbe1a748f
--- /dev/null
+++ b/arch/x86/kernel/cpu/nexgen.c
@@ -0,0 +1,60 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <linux/string.h>
4#include <asm/processor.h>
5
6#include "cpu.h"
7
8/*
9 * Detect a NexGen CPU running without BIOS hypercode new enough
10 * to have CPUID. (Thanks to Herbert Oppmann)
11 */
12
13static int __cpuinit deep_magic_nexgen_probe(void)
14{
15 int ret;
16
17 __asm__ __volatile__ (
18 " movw $0x5555, %%ax\n"
19 " xorw %%dx,%%dx\n"
20 " movw $2, %%cx\n"
21 " divw %%cx\n"
22 " movl $0, %%eax\n"
23 " jnz 1f\n"
24 " movl $1, %%eax\n"
25 "1:\n"
26 : "=a" (ret) : : "cx", "dx" );
27 return ret;
28}
29
30static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
31{
32 c->x86_cache_size = 256; /* A few had 1 MB... */
33}
34
35static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
36{
37 /* Detect NexGen with old hypercode */
38 if ( deep_magic_nexgen_probe() ) {
39 strcpy(c->x86_vendor_id, "NexGenDriven");
40 }
41}
42
43static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
44 .c_vendor = "Nexgen",
45 .c_ident = { "NexGenDriven" },
46 .c_models = {
47 { .vendor = X86_VENDOR_NEXGEN,
48 .family = 5,
49 .model_names = { [1] = "Nx586" }
50 },
51 },
52 .c_init = init_nexgen,
53 .c_identify = nexgen_identify,
54};
55
56int __init nexgen_init_cpu(void)
57{
58 cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
59 return 0;
60}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000000..93fecd4b03de
--- /dev/null
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,713 @@
1/* local apic based NMI watchdog for various CPUs.
2 This file also handles reservation of performance counters for coordination
3 with other users (like oprofile).
4
5 Note that these events normally don't tick when the CPU idles. This means
6 the frequency varies with CPU load.
7
8 Original code for K7/P6 written by Keith Owens */
9
10#include <linux/percpu.h>
11#include <linux/module.h>
12#include <linux/kernel.h>
13#include <linux/bitops.h>
14#include <linux/smp.h>
15#include <linux/nmi.h>
16#include <asm/apic.h>
17#include <asm/intel_arch_perfmon.h>
18
19struct nmi_watchdog_ctlblk {
20 unsigned int cccr_msr;
21 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
22 unsigned int evntsel_msr; /* the MSR to select the events to handle */
23};
24
25/* Interface defining a CPU specific perfctr watchdog */
26struct wd_ops {
27 int (*reserve)(void);
28 void (*unreserve)(void);
29 int (*setup)(unsigned nmi_hz);
30 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
31 void (*stop)(void);
32 unsigned perfctr;
33 unsigned evntsel;
34 u64 checkbit;
35};
36
37static struct wd_ops *wd_ops;
38
39/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
40 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
41 */
42#define NMI_MAX_COUNTER_BITS 66
43
44/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
45 * evntsel_nmi_owner tracks the ownership of the event selection registers.
46 * Different performance counters / event selections may be reserved by
47 * different subsystems; this reservation system just tries to coordinate
48 * things a little.
49 */
50static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
51static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
52
53static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
54
55/* converts an msr to an appropriate reservation bit */
56static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
57{
58 /* returns the bit offset of the performance counter register */
59 switch (boot_cpu_data.x86_vendor) {
60 case X86_VENDOR_AMD:
61 return (msr - MSR_K7_PERFCTR0);
62 case X86_VENDOR_INTEL:
63 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
64 return (msr - MSR_ARCH_PERFMON_PERFCTR0);
65
66 switch (boot_cpu_data.x86) {
67 case 6:
68 return (msr - MSR_P6_PERFCTR0);
69 case 15:
70 return (msr - MSR_P4_BPU_PERFCTR0);
71 }
72 }
73 return 0;
74}
75
76/* converts an msr to an appropriate reservation bit */
77/* returns the bit offset of the event selection register */
78static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
79{
80 /* returns the bit offset of the event selection register */
81 switch (boot_cpu_data.x86_vendor) {
82 case X86_VENDOR_AMD:
83 return (msr - MSR_K7_EVNTSEL0);
84 case X86_VENDOR_INTEL:
85 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
86 return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
87
88 switch (boot_cpu_data.x86) {
89 case 6:
90 return (msr - MSR_P6_EVNTSEL0);
91 case 15:
92 return (msr - MSR_P4_BSU_ESCR0);
93 }
94 }
95 return 0;
96
97}
98
 99/* checks whether a counter bit is available (hack for oprofile) */
100int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
101{
102 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
103
104 return (!test_bit(counter, perfctr_nmi_owner));
105}
106
 107/* checks an msr for availability */
108int avail_to_resrv_perfctr_nmi(unsigned int msr)
109{
110 unsigned int counter;
111
112 counter = nmi_perfctr_msr_to_bit(msr);
113 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
114
115 return (!test_bit(counter, perfctr_nmi_owner));
116}
117
118int reserve_perfctr_nmi(unsigned int msr)
119{
120 unsigned int counter;
121
122 counter = nmi_perfctr_msr_to_bit(msr);
123 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
124
125 if (!test_and_set_bit(counter, perfctr_nmi_owner))
126 return 1;
127 return 0;
128}
129
130void release_perfctr_nmi(unsigned int msr)
131{
132 unsigned int counter;
133
134 counter = nmi_perfctr_msr_to_bit(msr);
135 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
136
137 clear_bit(counter, perfctr_nmi_owner);
138}
139
140int reserve_evntsel_nmi(unsigned int msr)
141{
142 unsigned int counter;
143
144 counter = nmi_evntsel_msr_to_bit(msr);
145 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
146
147 if (!test_and_set_bit(counter, evntsel_nmi_owner))
148 return 1;
149 return 0;
150}
151
152void release_evntsel_nmi(unsigned int msr)
153{
154 unsigned int counter;
155
156 counter = nmi_evntsel_msr_to_bit(msr);
157 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
158
159 clear_bit(counter, evntsel_nmi_owner);
160}
161
162EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
163EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
164EXPORT_SYMBOL(reserve_perfctr_nmi);
165EXPORT_SYMBOL(release_perfctr_nmi);
166EXPORT_SYMBOL(reserve_evntsel_nmi);
167EXPORT_SYMBOL(release_evntsel_nmi);
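The exports above are the whole coordination contract: whichever subsystem first wins the test_and_set_bit() owns that counter/event-select bit until it releases it. A minimal sketch (illustrative only, not part of this file) of how another perfctr user such as a profiler might claim and release a K7 counter pair through these helpers:

static int example_claim_k7_counter(void)
{
	/* reserve_*() return 1 on success, 0 if someone else owns the bit */
	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
		return -EBUSY;
	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
		release_perfctr_nmi(MSR_K7_PERFCTR0);
		return -EBUSY;
	}
	/* ... program MSR_K7_EVNTSEL0 / MSR_K7_PERFCTR0 as needed ... */
	return 0;
}

static void example_release_k7_counter(void)
{
	release_evntsel_nmi(MSR_K7_EVNTSEL0);
	release_perfctr_nmi(MSR_K7_PERFCTR0);
}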
168
169void disable_lapic_nmi_watchdog(void)
170{
171 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
172
173 if (atomic_read(&nmi_active) <= 0)
174 return;
175
176 on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
177 wd_ops->unreserve();
178
179 BUG_ON(atomic_read(&nmi_active) != 0);
180}
181
182void enable_lapic_nmi_watchdog(void)
183{
184 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
185
186 /* are we already enabled */
187 if (atomic_read(&nmi_active) != 0)
188 return;
189
190 /* are we lapic aware */
191 if (!wd_ops)
192 return;
193 if (!wd_ops->reserve()) {
194 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
195 return;
196 }
197
198 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
199 touch_nmi_watchdog();
200}
201
202/*
203 * Activate the NMI watchdog via the local APIC.
204 */
205
206static unsigned int adjust_for_32bit_ctr(unsigned int hz)
207{
208 u64 counter_val;
209 unsigned int retval = hz;
210
211 /*
212 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
213 * are writable, with higher bits sign extending from bit 31.
 214	 * So we can only program the counter with values that fit in 31 bits;
 215	 * bit 31 must be set so the sign extension keeps bits 32 and up at 1.
 216	 * Find the smallest nmi_hz for which the programmed count still fits.
217 */
218 counter_val = (u64)cpu_khz * 1000;
219 do_div(counter_val, retval);
220 if (counter_val > 0x7fffffffULL) {
221 u64 count = (u64)cpu_khz * 1000;
222 do_div(count, 0x7fffffffUL);
223 retval = count + 1;
224 }
225 return retval;
226}
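Worked example (hypothetical 3 GHz part, cpu_khz = 3,000,000, requested nmi_hz = 1): counter_val = 3,000,000 * 1000 / 1 = 3,000,000,000, which exceeds 0x7fffffff (2,147,483,647), so retval becomes 3,000,000,000 / 2,147,483,647 + 1 = 2. On such a CPU the watchdog therefore has to fire at least twice per second, because a full second's worth of cycles does not fit in 31 bits.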
227
228static void
229write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
230{
231 u64 count = (u64)cpu_khz * 1000;
232
233 do_div(count, nmi_hz);
234 if(descr)
235 Dprintk("setting %s to -0x%08Lx\n", descr, count);
236 wrmsrl(perfctr_msr, 0 - count);
237}
238
239static void write_watchdog_counter32(unsigned int perfctr_msr,
240 const char *descr, unsigned nmi_hz)
241{
242 u64 count = (u64)cpu_khz * 1000;
243
244 do_div(count, nmi_hz);
245 if(descr)
246 Dprintk("setting %s to -0x%08Lx\n", descr, count);
247 wrmsr(perfctr_msr, (u32)(-count), 0);
248}
249
250/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
251 nicely stable so there is not much variety */
252
253#define K7_EVNTSEL_ENABLE (1 << 22)
254#define K7_EVNTSEL_INT (1 << 20)
255#define K7_EVNTSEL_OS (1 << 17)
256#define K7_EVNTSEL_USR (1 << 16)
257#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
258#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
259
260static int setup_k7_watchdog(unsigned nmi_hz)
261{
262 unsigned int perfctr_msr, evntsel_msr;
263 unsigned int evntsel;
264 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
265
266 perfctr_msr = wd_ops->perfctr;
267 evntsel_msr = wd_ops->evntsel;
268
269 wrmsrl(perfctr_msr, 0UL);
270
271 evntsel = K7_EVNTSEL_INT
272 | K7_EVNTSEL_OS
273 | K7_EVNTSEL_USR
274 | K7_NMI_EVENT;
275
276 /* setup the timer */
277 wrmsr(evntsel_msr, evntsel, 0);
278 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
279 apic_write(APIC_LVTPC, APIC_DM_NMI);
280 evntsel |= K7_EVNTSEL_ENABLE;
281 wrmsr(evntsel_msr, evntsel, 0);
282
283 wd->perfctr_msr = perfctr_msr;
284 wd->evntsel_msr = evntsel_msr;
285 wd->cccr_msr = 0; //unused
286 return 1;
287}
288
289static void single_msr_stop_watchdog(void)
290{
291 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
292
293 wrmsr(wd->evntsel_msr, 0, 0);
294}
295
296static int single_msr_reserve(void)
297{
298 if (!reserve_perfctr_nmi(wd_ops->perfctr))
299 return 0;
300
301 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
302 release_perfctr_nmi(wd_ops->perfctr);
303 return 0;
304 }
305 return 1;
306}
307
308static void single_msr_unreserve(void)
309{
310 release_evntsel_nmi(wd_ops->evntsel);
311 release_perfctr_nmi(wd_ops->perfctr);
312}
313
314static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
315{
316 /* start the cycle over again */
317 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
318}
319
320static struct wd_ops k7_wd_ops = {
321 .reserve = single_msr_reserve,
322 .unreserve = single_msr_unreserve,
323 .setup = setup_k7_watchdog,
324 .rearm = single_msr_rearm,
325 .stop = single_msr_stop_watchdog,
326 .perfctr = MSR_K7_PERFCTR0,
327 .evntsel = MSR_K7_EVNTSEL0,
328 .checkbit = 1ULL<<47,
329};
330
331/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
332
333#define P6_EVNTSEL0_ENABLE (1 << 22)
334#define P6_EVNTSEL_INT (1 << 20)
335#define P6_EVNTSEL_OS (1 << 17)
336#define P6_EVNTSEL_USR (1 << 16)
337#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
338#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
339
340static int setup_p6_watchdog(unsigned nmi_hz)
341{
342 unsigned int perfctr_msr, evntsel_msr;
343 unsigned int evntsel;
344 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
345
346 perfctr_msr = wd_ops->perfctr;
347 evntsel_msr = wd_ops->evntsel;
348
349 /* KVM doesn't implement this MSR */
350 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
351 return 0;
352
353 evntsel = P6_EVNTSEL_INT
354 | P6_EVNTSEL_OS
355 | P6_EVNTSEL_USR
356 | P6_NMI_EVENT;
357
358 /* setup the timer */
359 wrmsr(evntsel_msr, evntsel, 0);
360 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
361 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
362 apic_write(APIC_LVTPC, APIC_DM_NMI);
363 evntsel |= P6_EVNTSEL0_ENABLE;
364 wrmsr(evntsel_msr, evntsel, 0);
365
366 wd->perfctr_msr = perfctr_msr;
367 wd->evntsel_msr = evntsel_msr;
368 wd->cccr_msr = 0; //unused
369 return 1;
370}
371
372static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
373{
 374	/* P6-based Pentium M needs to re-unmask
 375	 * the apic vector, but doing so doesn't hurt
 376	 * other P6 variants.
 377	 * ArchPerfmon/Core Duo also needs this */
378 apic_write(APIC_LVTPC, APIC_DM_NMI);
379 /* P6/ARCH_PERFMON has 32 bit counter write */
380 write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
381}
382
383static struct wd_ops p6_wd_ops = {
384 .reserve = single_msr_reserve,
385 .unreserve = single_msr_unreserve,
386 .setup = setup_p6_watchdog,
387 .rearm = p6_rearm,
388 .stop = single_msr_stop_watchdog,
389 .perfctr = MSR_P6_PERFCTR0,
390 .evntsel = MSR_P6_EVNTSEL0,
391 .checkbit = 1ULL<<39,
392};
393
394/* Intel P4 performance counters. By far the most complicated of all. */
395
396#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
397#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
398#define P4_ESCR_OS (1<<3)
399#define P4_ESCR_USR (1<<2)
400#define P4_CCCR_OVF_PMI0 (1<<26)
401#define P4_CCCR_OVF_PMI1 (1<<27)
402#define P4_CCCR_THRESHOLD(N) ((N)<<20)
403#define P4_CCCR_COMPLEMENT (1<<19)
404#define P4_CCCR_COMPARE (1<<18)
405#define P4_CCCR_REQUIRED (3<<16)
406#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
407#define P4_CCCR_ENABLE (1<<12)
408#define P4_CCCR_OVF (1<<31)
409
410/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
411 CRU_ESCR0 (with any non-null event selector) through a complemented
412 max threshold. [IA32-Vol3, Section 14.9.9] */
413
414static int setup_p4_watchdog(unsigned nmi_hz)
415{
416 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
417 unsigned int evntsel, cccr_val;
418 unsigned int misc_enable, dummy;
419 unsigned int ht_num;
420 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
421
422 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
423 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
424 return 0;
425
426#ifdef CONFIG_SMP
427 /* detect which hyperthread we are on */
428 if (smp_num_siblings == 2) {
429 unsigned int ebx, apicid;
430
431 ebx = cpuid_ebx(1);
432 apicid = (ebx >> 24) & 0xff;
433 ht_num = apicid & 1;
434 } else
435#endif
436 ht_num = 0;
437
438 /* performance counters are shared resources
439 * assign each hyperthread its own set
440 * (re-use the ESCR0 register, seems safe
441 * and keeps the cccr_val the same)
442 */
443 if (!ht_num) {
444 /* logical cpu 0 */
445 perfctr_msr = MSR_P4_IQ_PERFCTR0;
446 evntsel_msr = MSR_P4_CRU_ESCR0;
447 cccr_msr = MSR_P4_IQ_CCCR0;
448 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
449 } else {
450 /* logical cpu 1 */
451 perfctr_msr = MSR_P4_IQ_PERFCTR1;
452 evntsel_msr = MSR_P4_CRU_ESCR0;
453 cccr_msr = MSR_P4_IQ_CCCR1;
454 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
455 }
456
457 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
458 | P4_ESCR_OS
459 | P4_ESCR_USR;
460
461 cccr_val |= P4_CCCR_THRESHOLD(15)
462 | P4_CCCR_COMPLEMENT
463 | P4_CCCR_COMPARE
464 | P4_CCCR_REQUIRED;
465
466 wrmsr(evntsel_msr, evntsel, 0);
467 wrmsr(cccr_msr, cccr_val, 0);
468 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
469 apic_write(APIC_LVTPC, APIC_DM_NMI);
470 cccr_val |= P4_CCCR_ENABLE;
471 wrmsr(cccr_msr, cccr_val, 0);
472 wd->perfctr_msr = perfctr_msr;
473 wd->evntsel_msr = evntsel_msr;
474 wd->cccr_msr = cccr_msr;
475 return 1;
476}
477
478static void stop_p4_watchdog(void)
479{
480 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
481 wrmsr(wd->cccr_msr, 0, 0);
482 wrmsr(wd->evntsel_msr, 0, 0);
483}
484
485static int p4_reserve(void)
486{
487 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
488 return 0;
489#ifdef CONFIG_SMP
490 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
491 goto fail1;
492#endif
493 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
494 goto fail2;
495 /* RED-PEN why is ESCR1 not reserved here? */
496 return 1;
497 fail2:
498#ifdef CONFIG_SMP
499 if (smp_num_siblings > 1)
500 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
501 fail1:
502#endif
503 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
504 return 0;
505}
506
507static void p4_unreserve(void)
508{
509#ifdef CONFIG_SMP
510 if (smp_num_siblings > 1)
511 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
512#endif
513 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
514 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
515}
516
517static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
518{
519 unsigned dummy;
520 /*
521 * P4 quirks:
 522	 * - An overflowed perfctr will assert its interrupt
523 * until the OVF flag in its CCCR is cleared.
524 * - LVTPC is masked on interrupt and must be
525 * unmasked by the LVTPC handler.
526 */
527 rdmsrl(wd->cccr_msr, dummy);
528 dummy &= ~P4_CCCR_OVF;
529 wrmsrl(wd->cccr_msr, dummy);
530 apic_write(APIC_LVTPC, APIC_DM_NMI);
531 /* start the cycle over again */
532 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
533}
534
535static struct wd_ops p4_wd_ops = {
536 .reserve = p4_reserve,
537 .unreserve = p4_unreserve,
538 .setup = setup_p4_watchdog,
539 .rearm = p4_rearm,
540 .stop = stop_p4_watchdog,
541 /* RED-PEN this is wrong for the other sibling */
542 .perfctr = MSR_P4_BPU_PERFCTR0,
543 .evntsel = MSR_P4_BSU_ESCR0,
544 .checkbit = 1ULL<<39,
545};
546
547/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
548 all future Intel CPUs. */
549
550#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
551#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
552
553static int setup_intel_arch_watchdog(unsigned nmi_hz)
554{
555 unsigned int ebx;
556 union cpuid10_eax eax;
557 unsigned int unused;
558 unsigned int perfctr_msr, evntsel_msr;
559 unsigned int evntsel;
560 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
561
562 /*
563 * Check whether the Architectural PerfMon supports
564 * Unhalted Core Cycles Event or not.
565 * NOTE: Corresponding bit = 0 in ebx indicates event present.
566 */
567 cpuid(10, &(eax.full), &ebx, &unused, &unused);
568 if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
569 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
570 return 0;
571
572 perfctr_msr = wd_ops->perfctr;
573 evntsel_msr = wd_ops->evntsel;
574
575 wrmsrl(perfctr_msr, 0UL);
576
577 evntsel = ARCH_PERFMON_EVENTSEL_INT
578 | ARCH_PERFMON_EVENTSEL_OS
579 | ARCH_PERFMON_EVENTSEL_USR
580 | ARCH_PERFMON_NMI_EVENT_SEL
581 | ARCH_PERFMON_NMI_EVENT_UMASK;
582
583 /* setup the timer */
584 wrmsr(evntsel_msr, evntsel, 0);
585 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
586 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
587 apic_write(APIC_LVTPC, APIC_DM_NMI);
588 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
589 wrmsr(evntsel_msr, evntsel, 0);
590
591 wd->perfctr_msr = perfctr_msr;
592 wd->evntsel_msr = evntsel_msr;
593 wd->cccr_msr = 0; //unused
594 wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
595 return 1;
596}
597
598static struct wd_ops intel_arch_wd_ops = {
599 .reserve = single_msr_reserve,
600 .unreserve = single_msr_unreserve,
601 .setup = setup_intel_arch_watchdog,
602 .rearm = p6_rearm,
603 .stop = single_msr_stop_watchdog,
604 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
605 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
606};
607
608static struct wd_ops coreduo_wd_ops = {
609 .reserve = single_msr_reserve,
610 .unreserve = single_msr_unreserve,
611 .setup = setup_intel_arch_watchdog,
612 .rearm = p6_rearm,
613 .stop = single_msr_stop_watchdog,
614 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
615 .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
616};
617
618static void probe_nmi_watchdog(void)
619{
620 switch (boot_cpu_data.x86_vendor) {
621 case X86_VENDOR_AMD:
622 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
623 boot_cpu_data.x86 != 16)
624 return;
625 wd_ops = &k7_wd_ops;
626 break;
627 case X86_VENDOR_INTEL:
 628		/* Work around Core Duo (Yonah) erratum AE49 where perfctr1
629 doesn't have a working enable bit. */
630 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
631 wd_ops = &coreduo_wd_ops;
632 break;
633 }
634 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
635 wd_ops = &intel_arch_wd_ops;
636 break;
637 }
638 switch (boot_cpu_data.x86) {
639 case 6:
640 if (boot_cpu_data.x86_model > 0xd)
641 return;
642
643 wd_ops = &p6_wd_ops;
644 break;
645 case 15:
646 if (boot_cpu_data.x86_model > 0x4)
647 return;
648
649 wd_ops = &p4_wd_ops;
650 break;
651 default:
652 return;
653 }
654 break;
655 }
656}
657
658/* Interface to nmi.c */
659
660int lapic_watchdog_init(unsigned nmi_hz)
661{
662 if (!wd_ops) {
663 probe_nmi_watchdog();
664 if (!wd_ops)
665 return -1;
666
667 if (!wd_ops->reserve()) {
668 printk(KERN_ERR
669 "NMI watchdog: cannot reserve perfctrs\n");
670 return -1;
671 }
672 }
673
674 if (!(wd_ops->setup(nmi_hz))) {
675 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
676 raw_smp_processor_id());
677 return -1;
678 }
679
680 return 0;
681}
682
683void lapic_watchdog_stop(void)
684{
685 if (wd_ops)
686 wd_ops->stop();
687}
688
689unsigned lapic_adjust_nmi_hz(unsigned hz)
690{
691 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
692 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
693 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
694 hz = adjust_for_32bit_ctr(hz);
695 return hz;
696}
697
698int lapic_wd_event(unsigned nmi_hz)
699{
700 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
701 u64 ctr;
702 rdmsrl(wd->perfctr_msr, ctr);
703 if (ctr & wd_ops->checkbit) { /* perfctr still running? */
704 return 0;
705 }
706 wd_ops->rearm(wd, nmi_hz);
707 return 1;
708}
709
710int lapic_watchdog_ok(void)
711{
712 return wd_ops != NULL;
713}
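A rough sketch (illustrative only; the real nmi.c side also handles per-cpu enable flags, unknown-NMI accounting and so on) of how this interface is meant to be driven:

/* boot / CPU-online path */
void example_watchdog_setup(unsigned nmi_hz)
{
	if (lapic_watchdog_init(nmi_hz) < 0)
		return;				/* no usable perfctr watchdog here */
	nmi_hz = lapic_adjust_nmi_hz(nmi_hz);	/* clamp for 32-bit counters */
	/* ... remember nmi_hz for the NMI handler below ... */
}

/* NMI handler path */
int example_watchdog_tick(unsigned nmi_hz)
{
	if (lapic_wd_event(nmi_hz))
		return 1;	/* our counter rolled over and was re-armed */
	return 0;		/* still counting, or not our perfctr at all */
}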
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
new file mode 100644
index 000000000000..1e31b6caffb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/proc.c
@@ -0,0 +1,192 @@
1#include <linux/smp.h>
2#include <linux/timex.h>
3#include <linux/string.h>
4#include <asm/semaphore.h>
5#include <linux/seq_file.h>
6#include <linux/cpufreq.h>
7
8/*
9 * Get CPU information for use by the procfs.
10 */
11static int show_cpuinfo(struct seq_file *m, void *v)
12{
13 /*
14 * These flag bits must match the definitions in <asm/cpufeature.h>.
15 * NULL means this bit is undefined or reserved; either way it doesn't
16 * have meaning as far as Linux is concerned. Note that it's important
17 * to realize there is a difference between this table and CPUID -- if
18 * applications want to get the raw CPUID data, they should access
19 * /dev/cpu/<cpu_nr>/cpuid instead.
20 */
21 static const char * const x86_cap_flags[] = {
22 /* Intel-defined */
23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
25 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
26 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
27
28 /* AMD-defined */
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
33 "3dnowext", "3dnow",
34
35 /* Transmeta-defined */
36 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
37 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
39 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
40
41 /* Other (Linux-defined) */
42 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
43 NULL, NULL, NULL, NULL,
44 "constant_tsc", "up", NULL, "arch_perfmon",
45 "pebs", "bts", NULL, "sync_rdtsc",
46 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
47 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
48
49 /* Intel-defined (#2) */
50 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
51 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
52 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
53 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
54
55 /* VIA/Cyrix/Centaur-defined */
56 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
57 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
58 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
59 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
60
61 /* AMD-defined (#2) */
62 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
63 "altmovcr8", "abm", "sse4a",
64 "misalignsse", "3dnowprefetch",
65 "osvw", "ibs", NULL, NULL, NULL, NULL,
66 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68
69 /* Auxiliary (Linux-defined) */
70 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
72 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
73 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
74 };
75 static const char * const x86_power_flags[] = {
76 "ts", /* temperature sensor */
77 "fid", /* frequency id control */
78 "vid", /* voltage id control */
79 "ttp", /* thermal trip */
80 "tm",
81 "stc",
82 "100mhzsteps",
83 "hwpstate",
84 "", /* constant_tsc - moved to flags */
85 /* nothing */
86 };
87 struct cpuinfo_x86 *c = v;
88 int i, n = c - cpu_data;
89 int fpu_exception;
90
91#ifdef CONFIG_SMP
92 if (!cpu_online(n))
93 return 0;
94#endif
95 seq_printf(m, "processor\t: %d\n"
96 "vendor_id\t: %s\n"
97 "cpu family\t: %d\n"
98 "model\t\t: %d\n"
99 "model name\t: %s\n",
100 n,
101 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
102 c->x86,
103 c->x86_model,
104 c->x86_model_id[0] ? c->x86_model_id : "unknown");
105
106 if (c->x86_mask || c->cpuid_level >= 0)
107 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
108 else
109 seq_printf(m, "stepping\t: unknown\n");
110
111 if ( cpu_has(c, X86_FEATURE_TSC) ) {
112 unsigned int freq = cpufreq_quick_get(n);
113 if (!freq)
114 freq = cpu_khz;
115 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
116 freq / 1000, (freq % 1000));
117 }
118
119 /* Cache size */
120 if (c->x86_cache_size >= 0)
121 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
122#ifdef CONFIG_X86_HT
123 if (c->x86_max_cores * smp_num_siblings > 1) {
124 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
125 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
126 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
127 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
128 }
129#endif
130
131 /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */
132 fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
133 seq_printf(m, "fdiv_bug\t: %s\n"
134 "hlt_bug\t\t: %s\n"
135 "f00f_bug\t: %s\n"
136 "coma_bug\t: %s\n"
137 "fpu\t\t: %s\n"
138 "fpu_exception\t: %s\n"
139 "cpuid level\t: %d\n"
140 "wp\t\t: %s\n"
141 "flags\t\t:",
142 c->fdiv_bug ? "yes" : "no",
143 c->hlt_works_ok ? "no" : "yes",
144 c->f00f_bug ? "yes" : "no",
145 c->coma_bug ? "yes" : "no",
146 c->hard_math ? "yes" : "no",
147 fpu_exception ? "yes" : "no",
148 c->cpuid_level,
149 c->wp_works_ok ? "yes" : "no");
150
151 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
152 if ( test_bit(i, c->x86_capability) &&
153 x86_cap_flags[i] != NULL )
154 seq_printf(m, " %s", x86_cap_flags[i]);
155
156 for (i = 0; i < 32; i++)
157 if (c->x86_power & (1 << i)) {
158 if (i < ARRAY_SIZE(x86_power_flags) &&
159 x86_power_flags[i])
160 seq_printf(m, "%s%s",
161 x86_power_flags[i][0]?" ":"",
162 x86_power_flags[i]);
163 else
164 seq_printf(m, " [%d]", i);
165 }
166
167 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
168 c->loops_per_jiffy/(500000/HZ),
169 (c->loops_per_jiffy/(5000/HZ)) % 100);
170 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
171
172 return 0;
173}
174
175static void *c_start(struct seq_file *m, loff_t *pos)
176{
177 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
178}
179static void *c_next(struct seq_file *m, void *v, loff_t *pos)
180{
181 ++*pos;
182 return c_start(m, pos);
183}
184static void c_stop(struct seq_file *m, void *v)
185{
186}
187struct seq_operations cpuinfo_op = {
188 .start = c_start,
189 .next = c_next,
190 .stop = c_stop,
191 .show = show_cpuinfo,
192};
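With the seq_file iterator wired up this way, each online CPU gets one block from show_cpuinfo(); on a hypothetical uniprocessor part the rendered /proc/cpuinfo looks roughly like this (values invented for illustration):

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 13
model name	: (model string reported by the CPU)
stepping	: 8
cpu MHz		: 1598.000
cache size	: 2048 KB
fdiv_bug	: no
...
flags		: fpu vme de pse tsc msr ...
bogomips	: 3194.88
clflush size	: 64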
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
new file mode 100644
index 000000000000..200fb3f9ebfb
--- /dev/null
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -0,0 +1,116 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h>
5#include <asm/msr.h>
6#include "cpu.h"
7
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{
10 unsigned int cap_mask, uk, max, dummy;
11 unsigned int cms_rev1, cms_rev2;
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65];
14
15 get_model_name(c); /* Same as AMD/Cyrix */
16 display_cacheinfo(c);
17
18 /* Print CMS and CPU revision */
19 max = cpuid_eax(0x80860000);
20 cpu_rev = 0;
21 if ( max >= 0x80860001 ) {
22 cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
23 if (cpu_rev != 0x02000000) {
24 printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
25 (cpu_rev >> 24) & 0xff,
26 (cpu_rev >> 16) & 0xff,
27 (cpu_rev >> 8) & 0xff,
28 cpu_rev & 0xff,
29 cpu_freq);
30 }
31 }
32 if ( max >= 0x80860002 ) {
33 cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
34 if (cpu_rev == 0x02000000) {
35 printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
36 new_cpu_rev, cpu_freq);
37 }
38 printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
39 (cms_rev1 >> 24) & 0xff,
40 (cms_rev1 >> 16) & 0xff,
41 (cms_rev1 >> 8) & 0xff,
42 cms_rev1 & 0xff,
43 cms_rev2);
44 }
45 if ( max >= 0x80860006 ) {
46 cpuid(0x80860003,
47 (void *)&cpu_info[0],
48 (void *)&cpu_info[4],
49 (void *)&cpu_info[8],
50 (void *)&cpu_info[12]);
51 cpuid(0x80860004,
52 (void *)&cpu_info[16],
53 (void *)&cpu_info[20],
54 (void *)&cpu_info[24],
55 (void *)&cpu_info[28]);
56 cpuid(0x80860005,
57 (void *)&cpu_info[32],
58 (void *)&cpu_info[36],
59 (void *)&cpu_info[40],
60 (void *)&cpu_info[44]);
61 cpuid(0x80860006,
62 (void *)&cpu_info[48],
63 (void *)&cpu_info[52],
64 (void *)&cpu_info[56],
65 (void *)&cpu_info[60]);
66 cpu_info[64] = '\0';
67 printk(KERN_INFO "CPU: %s\n", cpu_info);
68 }
69
70 /* Unhide possibly hidden capability flags */
71 rdmsr(0x80860004, cap_mask, uk);
72 wrmsr(0x80860004, ~0, uk);
73 c->x86_capability[0] = cpuid_edx(0x00000001);
74 wrmsr(0x80860004, cap_mask, uk);
75
76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
78
79 /* If we can run i686 user-space code, call us an i686 */
80#define USER686 ((1 << X86_FEATURE_TSC)|\
81 (1 << X86_FEATURE_CX8)|\
82 (1 << X86_FEATURE_CMOV))
83 if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
84 c->x86 = 6;
85
86#ifdef CONFIG_SYSCTL
87 /* randomize_va_space slows us down enormously;
88 it probably triggers retranslation of x86->native bytecode */
89 randomize_va_space = 0;
90#endif
91}
92
93static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
94{
95 u32 xlvl;
96
97 /* Transmeta-defined flags: level 0x80860001 */
98 xlvl = cpuid_eax(0x80860000);
99 if ( (xlvl & 0xffff0000) == 0x80860000 ) {
100 if ( xlvl >= 0x80860001 )
101 c->x86_capability[2] = cpuid_edx(0x80860001);
102 }
103}
104
105static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
106 .c_vendor = "Transmeta",
107 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
108 .c_init = init_transmeta,
109 .c_identify = transmeta_identify,
110};
111
112int __init transmeta_init_cpu(void)
113{
114 cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
115 return 0;
116}
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
new file mode 100644
index 000000000000..a7a4e75bdcd7
--- /dev/null
+++ b/arch/x86/kernel/cpu/umc.c
@@ -0,0 +1,26 @@
1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h>
4#include "cpu.h"
5
 6/* UMC chips appear to be only 386 or 486, so no special init takes place.
7 */
8
9static struct cpu_dev umc_cpu_dev __cpuinitdata = {
10 .c_vendor = "UMC",
11 .c_ident = { "UMC UMC UMC" },
12 .c_models = {
13 { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
14 {
15 [1] = "U5D",
16 [2] = "U5S",
17 }
18 },
19 },
20};
21
22int __init umc_init_cpu(void)
23{
24 cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
25 return 0;
26}
diff --git a/arch/x86/kernel/cpufreq/Kconfig b/arch/x86/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..a3fd51926cbd
--- /dev/null
+++ b/arch/x86/kernel/cpufreq/Kconfig
@@ -0,0 +1,108 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_POWERNOW_K8
14 tristate "AMD Opteron/Athlon64 PowerNow!"
15 select CPU_FREQ_TABLE
16 help
17 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
18
19 To compile this driver as a module, choose M here: the
20 module will be called powernow-k8.
21
22 For details, take a look at <file:Documentation/cpu-freq/>.
23
24 If in doubt, say N.
25
26config X86_POWERNOW_K8_ACPI
27 bool
28 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
29 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
30 default y
31
32config X86_SPEEDSTEP_CENTRINO
33 tristate "Intel Enhanced SpeedStep (deprecated)"
34 select CPU_FREQ_TABLE
35 depends on ACPI_PROCESSOR
36 help
37 This is deprecated and this functionality is now merged into
38 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
39 speedstep_centrino.
40 This adds the CPUFreq driver for Enhanced SpeedStep enabled
41 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
42 or 64bit enabled Intel Xeons.
43
44 To compile this driver as a module, choose M here: the
45 module will be called speedstep-centrino.
46
47 For details, take a look at <file:Documentation/cpu-freq/>.
48
49 If in doubt, say N.
50
51config X86_ACPI_CPUFREQ
52 tristate "ACPI Processor P-States driver"
53 select CPU_FREQ_TABLE
54 depends on ACPI_PROCESSOR
55 help
56 This driver adds a CPUFreq driver which utilizes the ACPI
57 Processor Performance States.
58 This driver also supports Intel Enhanced Speedstep.
59
60 To compile this driver as a module, choose M here: the
61 module will be called acpi-cpufreq.
62
63 For details, take a look at <file:Documentation/cpu-freq/>.
64
65 If in doubt, say N.
66
67comment "shared options"
68
69config X86_ACPI_CPUFREQ_PROC_INTF
70 bool "/proc/acpi/processor/../performance interface (deprecated)"
71 depends on PROC_FS
72 depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI
73 help
74 This enables the deprecated /proc/acpi/processor/../performance
75 interface. While it is helpful for debugging, the generic,
76 cross-architecture cpufreq interfaces should be used.
77
78 If in doubt, say N.
79
80config X86_P4_CLOCKMOD
81 tristate "Intel Pentium 4 clock modulation"
82 depends on EMBEDDED
83 select CPU_FREQ_TABLE
84 help
85 This adds the clock modulation driver for Intel Pentium 4 / XEON
86 processors. When enabled it will lower CPU temperature by skipping
87 clocks.
88
 89	  This driver should only be used in exceptional
90 circumstances when very low power is needed because it causes severe
91 slowdowns and noticeable latencies. Normally Speedstep should be used
92 instead.
93
94 To compile this driver as a module, choose M here: the
95 module will be called p4-clockmod.
96
97 For details, take a look at <file:Documentation/cpu-freq/>.
98
99 Unless you are absolutely sure say N.
100
101
102config X86_SPEEDSTEP_LIB
103 tristate
104 default X86_P4_CLOCKMOD
105
106endif
107
108endmenu
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 000000000000..5c2faa10e9fa
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,242 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * cpuid.c
15 *
16 * x86 CPUID access device
17 *
18 * This device is accessed by lseek() to the appropriate CPUID level
19 * and then read in chunks of 16 bytes. A larger size means multiple
20 * reads of consecutive levels.
21 *
22 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
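An illustrative user-space sketch (not part of this driver) of the access pattern just described, assuming the /dev/cpu/0/cpuid node already exists:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx of one CPUID level */
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0)
		return 1;
	/* the file offset selects the CPUID level; each level is 16 bytes */
	if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs))
		return 1;
	printf("max standard CPUID level: %u\n", regs[0]);
	close(fd);
	return 0;
}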
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/major.h>
35#include <linux/fs.h>
36#include <linux/smp_lock.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *cpuid_class;
47
48#ifdef CONFIG_SMP
49
50struct cpuid_command {
51 u32 reg;
52 u32 *data;
53};
54
55static void cpuid_smp_cpuid(void *cmd_block)
56{
57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
58
59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
60 &cmd->data[3]);
61}
62
63static inline void do_cpuid(int cpu, u32 reg, u32 * data)
64{
65 struct cpuid_command cmd;
66
67 preempt_disable();
68 if (cpu == smp_processor_id()) {
69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
70 } else {
71 cmd.reg = reg;
72 cmd.data = data;
73
74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
75 }
76 preempt_enable();
77}
78#else /* ! CONFIG_SMP */
79
80static inline void do_cpuid(int cpu, u32 reg, u32 * data)
81{
82 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
83}
84
85#endif /* ! CONFIG_SMP */
86
87static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
88{
89 loff_t ret;
90
91 lock_kernel();
92
93 switch (orig) {
94 case 0:
95 file->f_pos = offset;
96 ret = file->f_pos;
97 break;
98 case 1:
99 file->f_pos += offset;
100 ret = file->f_pos;
101 break;
102 default:
103 ret = -EINVAL;
104 }
105
106 unlock_kernel();
107 return ret;
108}
109
110static ssize_t cpuid_read(struct file *file, char __user *buf,
111 size_t count, loff_t * ppos)
112{
113 char __user *tmp = buf;
114 u32 data[4];
115 u32 reg = *ppos;
116 int cpu = iminor(file->f_path.dentry->d_inode);
117
118 if (count % 16)
119 return -EINVAL; /* Invalid chunk size */
120
121 for (; count; count -= 16) {
122 do_cpuid(cpu, reg, data);
123 if (copy_to_user(tmp, &data, 16))
124 return -EFAULT;
125 tmp += 16;
126 *ppos = reg++;
127 }
128
129 return tmp - buf;
130}
131
132static int cpuid_open(struct inode *inode, struct file *file)
133{
134 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
135 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
136
137 if (cpu >= NR_CPUS || !cpu_online(cpu))
138 return -ENXIO; /* No such CPU */
139 if (c->cpuid_level < 0)
140 return -EIO; /* CPUID not supported */
141
142 return 0;
143}
144
145/*
146 * File operations we support
147 */
148static const struct file_operations cpuid_fops = {
149 .owner = THIS_MODULE,
150 .llseek = cpuid_seek,
151 .read = cpuid_read,
152 .open = cpuid_open,
153};
154
155static int cpuid_device_create(int i)
156{
157 int err = 0;
158 struct device *dev;
159
160 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
161 if (IS_ERR(dev))
162 err = PTR_ERR(dev);
163 return err;
164}
165
166static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
167{
168 unsigned int cpu = (unsigned long)hcpu;
169
170 switch (action) {
171 case CPU_ONLINE:
172 case CPU_ONLINE_FROZEN:
173 cpuid_device_create(cpu);
174 break;
175 case CPU_DEAD:
176 case CPU_DEAD_FROZEN:
177 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
178 break;
179 }
180 return NOTIFY_OK;
181}
182
183static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
184{
185 .notifier_call = cpuid_class_cpu_callback,
186};
187
188static int __init cpuid_init(void)
189{
190 int i, err = 0;
191 i = 0;
192
193 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
194 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
195 CPUID_MAJOR);
196 err = -EBUSY;
197 goto out;
198 }
199 cpuid_class = class_create(THIS_MODULE, "cpuid");
200 if (IS_ERR(cpuid_class)) {
201 err = PTR_ERR(cpuid_class);
202 goto out_chrdev;
203 }
204 for_each_online_cpu(i) {
205 err = cpuid_device_create(i);
206 if (err != 0)
207 goto out_class;
208 }
209 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
210
211 err = 0;
212 goto out;
213
214out_class:
215 i = 0;
216 for_each_online_cpu(i) {
217 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
218 }
219 class_destroy(cpuid_class);
220out_chrdev:
221 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
222out:
223 return err;
224}
225
226static void __exit cpuid_exit(void)
227{
228 int cpu = 0;
229
230 for_each_online_cpu(cpu)
231 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
232 class_destroy(cpuid_class);
233 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
235}
236
237module_init(cpuid_init);
238module_exit(cpuid_exit);
239
240MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
241MODULE_DESCRIPTION("x86 generic CPUID driver");
242MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c
new file mode 100644
index 000000000000..53589d1b1a05
--- /dev/null
+++ b/arch/x86/kernel/crash_32.c
@@ -0,0 +1,137 @@
1/*
2 * Architecture specific (i386) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/reboot.h>
15#include <linux/kexec.h>
16#include <linux/delay.h>
17#include <linux/elf.h>
18#include <linux/elfcore.h>
19
20#include <asm/processor.h>
21#include <asm/hardirq.h>
22#include <asm/nmi.h>
23#include <asm/hw_irq.h>
24#include <asm/apic.h>
25#include <linux/kdebug.h>
26#include <asm/smp.h>
27
28#include <mach_ipi.h>
29
30
 31/* This keeps track of which cpu is crashing. */
32static int crashing_cpu;
33
34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
35static atomic_t waiting_for_crash_ipi;
36
37static int crash_nmi_callback(struct notifier_block *self,
38 unsigned long val, void *data)
39{
40 struct pt_regs *regs;
41 struct pt_regs fixed_regs;
42 int cpu;
43
44 if (val != DIE_NMI_IPI)
45 return NOTIFY_OK;
46
47 regs = ((struct die_args *)data)->regs;
48 cpu = raw_smp_processor_id();
49
 50	/* Don't do anything if this handler is invoked on the crashing cpu.
 51	 * Otherwise, the system will completely hang. The crashing cpu can get
 52	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
53 */
54 if (cpu == crashing_cpu)
55 return NOTIFY_STOP;
56 local_irq_disable();
57
58 if (!user_mode_vm(regs)) {
59 crash_fixup_ss_esp(&fixed_regs, regs);
60 regs = &fixed_regs;
61 }
62 crash_save_cpu(regs, cpu);
63 disable_local_APIC();
64 atomic_dec(&waiting_for_crash_ipi);
65 /* Assume hlt works */
66 halt();
67 for (;;)
68 cpu_relax();
69
70 return 1;
71}
72
73static void smp_send_nmi_allbutself(void)
74{
75 cpumask_t mask = cpu_online_map;
76 cpu_clear(safe_smp_processor_id(), mask);
77 if (!cpus_empty(mask))
78 send_IPI_mask(mask, NMI_VECTOR);
79}
80
81static struct notifier_block crash_nmi_nb = {
82 .notifier_call = crash_nmi_callback,
83};
84
85static void nmi_shootdown_cpus(void)
86{
87 unsigned long msecs;
88
89 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
90 /* Would it be better to replace the trap vector here? */
91 if (register_die_notifier(&crash_nmi_nb))
92 return; /* return what? */
93 /* Ensure the new callback function is set before sending
94 * out the NMI
95 */
96 wmb();
97
98 smp_send_nmi_allbutself();
99
100 msecs = 1000; /* Wait at most a second for the other cpus to stop */
101 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
102 mdelay(1);
103 msecs--;
104 }
105
106 /* Leave the nmi callback set */
107 disable_local_APIC();
108}
109#else
110static void nmi_shootdown_cpus(void)
111{
 112	/* There are no cpus to shoot down */
113}
114#endif
115
116void machine_crash_shutdown(struct pt_regs *regs)
117{
118 /* This function is only called after the system
119 * has panicked or is otherwise in a critical state.
120 * The minimum amount of code to allow a kexec'd kernel
121 * to run successfully needs to happen here.
122 *
123 * In practice this means shooting down the other cpus in
124 * an SMP system.
125 */
126 /* The kernel is broken so disable interrupts */
127 local_irq_disable();
128
 129	/* Make a note of the crashing cpu. Will be used in the NMI callback. */
130 crashing_cpu = safe_smp_processor_id();
131 nmi_shootdown_cpus();
132 lapic_shutdown();
133#if defined(CONFIG_X86_IO_APIC)
134 disable_IO_APIC();
135#endif
136 crash_save_cpu(regs, safe_smp_processor_id());
137}
diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c
new file mode 100644
index 000000000000..13432a1ae904
--- /dev/null
+++ b/arch/x86/kernel/crash_64.c
@@ -0,0 +1,135 @@
1/*
2 * Architecture specific (x86_64) functions for kexec based crash dumps.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 *
6 * Copyright (C) IBM Corporation, 2004. All rights reserved.
7 *
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/smp.h>
14#include <linux/irq.h>
15#include <linux/reboot.h>
16#include <linux/kexec.h>
17#include <linux/delay.h>
18#include <linux/elf.h>
19#include <linux/elfcore.h>
20#include <linux/kdebug.h>
21
22#include <asm/processor.h>
23#include <asm/hardirq.h>
24#include <asm/nmi.h>
25#include <asm/hw_irq.h>
26#include <asm/mach_apic.h>
27
 28/* This keeps track of which cpu is crashing. */
29static int crashing_cpu;
30
31#ifdef CONFIG_SMP
32static atomic_t waiting_for_crash_ipi;
33
34static int crash_nmi_callback(struct notifier_block *self,
35 unsigned long val, void *data)
36{
37 struct pt_regs *regs;
38 int cpu;
39
40 if (val != DIE_NMI_IPI)
41 return NOTIFY_OK;
42
43 regs = ((struct die_args *)data)->regs;
44 cpu = raw_smp_processor_id();
45
46 /*
 47	 * Don't do anything if this handler is invoked on the crashing cpu.
 48	 * Otherwise, the system will completely hang. The crashing cpu can get
 49	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
50 */
51 if (cpu == crashing_cpu)
52 return NOTIFY_STOP;
53 local_irq_disable();
54
55 crash_save_cpu(regs, cpu);
56 disable_local_APIC();
57 atomic_dec(&waiting_for_crash_ipi);
58 /* Assume hlt works */
59 for(;;)
60 halt();
61
62 return 1;
63}
64
65static void smp_send_nmi_allbutself(void)
66{
67 send_IPI_allbutself(NMI_VECTOR);
68}
69
70/*
71 * This code is a best effort heuristic to get the
72 * other cpus to stop executing. So races with
73 * cpu hotplug shouldn't matter.
74 */
75
76static struct notifier_block crash_nmi_nb = {
77 .notifier_call = crash_nmi_callback,
78};
79
80static void nmi_shootdown_cpus(void)
81{
82 unsigned long msecs;
83
84 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
85 if (register_die_notifier(&crash_nmi_nb))
86 return; /* return what? */
87
88 /*
89 * Ensure the new callback function is set before sending
90 * out the NMI
91 */
92 wmb();
93
94 smp_send_nmi_allbutself();
95
96 msecs = 1000; /* Wait at most a second for the other cpus to stop */
97 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
98 mdelay(1);
99 msecs--;
100 }
101 /* Leave the nmi callback set */
102 disable_local_APIC();
103}
104#else
105static void nmi_shootdown_cpus(void)
106{
 107	/* There are no cpus to shoot down */
108}
109#endif
110
111void machine_crash_shutdown(struct pt_regs *regs)
112{
113 /*
114 * This function is only called after the system
115 * has panicked or is otherwise in a critical state.
116 * The minimum amount of code to allow a kexec'd kernel
117 * to run successfully needs to happen here.
118 *
119 * In practice this means shooting down the other cpus in
120 * an SMP system.
121 */
122 /* The kernel is broken so disable interrupts */
123 local_irq_disable();
124
 125	/* Make a note of the crashing cpu. Will be used in the NMI callback. */
126 crashing_cpu = smp_processor_id();
127 nmi_shootdown_cpus();
128
129 if(cpu_has_apic)
130 disable_local_APIC();
131
132 disable_IO_APIC();
133
134 crash_save_cpu(regs, smp_processor_id());
135}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 000000000000..3f532df488bc
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,74 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/highmem.h>
10#include <linux/crash_dump.h>
11
12#include <asm/uaccess.h>
13
14static void *kdump_buf_page;
15
16/**
17 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied
19 * @buf: target memory address for the copy; this can be in kernel address
20 * space or user address space (see @userbuf)
21 * @csize: number of bytes to copy
22 * @offset: offset in bytes into the page (based on pfn) to begin the copy
23 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
24 * otherwise @buf is in kernel address space, use memcpy().
25 *
26 * Copy a page from "oldmem". For this page, there is no pte mapped
27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
28 *
 29 * Calling copy_to_user() in atomic context is not desirable, so we first
 30 * copy the data to a pre-allocated kernel page and then copy it to user
 31 * space from non-atomic context.
32 */
33ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 size_t csize, unsigned long offset, int userbuf)
35{
36 void *vaddr;
37
38 if (!csize)
39 return 0;
40
41 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
42
43 if (!userbuf) {
44 memcpy(buf, (vaddr + offset), csize);
45 kunmap_atomic(vaddr, KM_PTE0);
46 } else {
47 if (!kdump_buf_page) {
48 printk(KERN_WARNING "Kdump: Kdump buffer page not"
49 " allocated\n");
50 return -EFAULT;
51 }
52 copy_page(kdump_buf_page, vaddr);
53 kunmap_atomic(vaddr, KM_PTE0);
54 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
55 return -EFAULT;
56 }
57
58 return csize;
59}
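A sketch (illustrative only; the real caller lives in the generic vmcore code) of the kind of loop a dump reader runs on top of copy_oldmem_page(), assuming paddr is an old-kernel physical address and buf a user-space destination:

static ssize_t example_read_oldmem(char *buf, size_t count, u64 paddr)
{
	ssize_t total = 0;

	while (count) {
		unsigned long pfn = paddr >> PAGE_SHIFT;
		unsigned long offset = paddr & (PAGE_SIZE - 1);
		size_t chunk = min_t(size_t, count, PAGE_SIZE - offset);
		ssize_t ret = copy_oldmem_page(pfn, buf, chunk, offset, 1);

		if (ret < 0)
			return ret;
		buf += ret;
		paddr += ret;
		count -= ret;
		total += ret;
	}
	return total;
}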
60
61static int __init kdump_buf_page_init(void)
62{
63 int ret = 0;
64
65 kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
66 if (!kdump_buf_page) {
67 printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
68 " page\n");
69 ret = -ENOMEM;
70 }
71
72 return ret;
73}
74arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644
index 000000000000..942deac4d43a
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -0,0 +1,47 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/errno.h>
9#include <linux/crash_dump.h>
10
11#include <asm/uaccess.h>
12#include <asm/io.h>
13
14/**
15 * copy_oldmem_page - copy one page from "oldmem"
16 * @pfn: page frame number to be copied
17 * @buf: target memory address for the copy; this can be in kernel address
18 * space or user address space (see @userbuf)
19 * @csize: number of bytes to copy
20 * @offset: offset in bytes into the page (based on pfn) to begin the copy
21 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
22 * otherwise @buf is in kernel address space, use memcpy().
23 *
24 * Copy a page from "oldmem". For this page, there is no pte mapped
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf)
29{
30 void *vaddr;
31
32 if (!csize)
33 return 0;
34
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
36
37 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) {
39 iounmap(vaddr);
40 return -EFAULT;
41 }
42 } else
43 memcpy(buf, (vaddr + offset), csize);
44
45 iounmap(vaddr);
46 return csize;
47}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..40978af630e7
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,70 @@
1#include <linux/mm.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h>
5#include <linux/fs.h>
6
7#include <asm/uaccess.h>
8#include <asm/pgtable.h>
9#include <asm/processor.h>
10#include <asm/desc.h>
11
12#define DOUBLEFAULT_STACKSIZE (1024)
13static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
14#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
15
16#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
17
18static void doublefault_fn(void)
19{
20 struct Xgt_desc_struct gdt_desc = {0, 0};
21 unsigned long gdt, tss;
22
23 store_gdt(&gdt_desc);
24 gdt = gdt_desc.address;
25
26 printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
27
28 if (ptr_ok(gdt)) {
29 gdt += GDT_ENTRY_TSS << 3;
30 tss = *(u16 *)(gdt+2);
31 tss += *(u8 *)(gdt+4) << 16;
32 tss += *(u8 *)(gdt+7) << 24;
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34
35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
39
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi);
44 }
45 }
46
47 for (;;)
48 cpu_relax();
49}
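For reference, the three partial loads above reassemble the 32-bit TSS base from the GDT descriptor's split base fields: bits 15:0 sit at descriptor offset 2, bits 23:16 at offset 4 and bits 31:24 at offset 7. A hypothetical descriptor carrying bytes 0x34,0x12 at offset 2, 0xc0 at offset 4 and 0xff at offset 7 therefore decodes to a TSS base of 0xffc01234.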
50
51struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = {
53 .esp0 = STACK_START,
54 .ss0 = __KERNEL_DS,
55 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57
58 .eip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START,
62 .es = __USER_DS,
63 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS,
65 .ds = __USER_DS,
66 .fs = __KERNEL_PERCPU,
67
68 .__cr3 = __pa(swapper_pg_dir)
69 }
70};
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
new file mode 100644
index 000000000000..3c86b979a40a
--- /dev/null
+++ b/arch/x86/kernel/e820_32.c
@@ -0,0 +1,944 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13#include <linux/suspend.h>
14
15#include <asm/pgtable.h>
16#include <asm/page.h>
17#include <asm/e820.h>
18#include <asm/setup.h>
19
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820;
26struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */
28 unsigned long long addr; /* address for this change point */
29};
30static struct change_member change_point_list[2*E820MAX] __initdata;
31static struct change_member *change_point[2*E820MAX] __initdata;
32static struct e820entry *overlap_list[E820MAX] __initdata;
33static struct e820entry new_bios[E820MAX] __initdata;
34/* For PCI or other memory-mapped resources */
35unsigned long pci_mem_start = 0x10000000;
36#ifdef CONFIG_PCI
37EXPORT_SYMBOL(pci_mem_start);
38#endif
39extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54static struct resource system_rom_resource = {
55 .name = "System ROM",
56 .start = 0xf0000,
57 .end = 0xfffff,
58 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
59};
60
61static struct resource extension_rom_resource = {
62 .name = "Extension ROM",
63 .start = 0xe0000,
64 .end = 0xeffff,
65 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
66};
67
68static struct resource adapter_rom_resources[] = { {
69 .name = "Adapter ROM",
70 .start = 0xc8000,
71 .end = 0,
72 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
73}, {
74 .name = "Adapter ROM",
75 .start = 0,
76 .end = 0,
77 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
78}, {
79 .name = "Adapter ROM",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
83}, {
84 .name = "Adapter ROM",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
88}, {
89 .name = "Adapter ROM",
90 .start = 0,
91 .end = 0,
92 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
93}, {
94 .name = "Adapter ROM",
95 .start = 0,
96 .end = 0,
97 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
98} };
99
100static struct resource video_rom_resource = {
101 .name = "Video ROM",
102 .start = 0xc0000,
103 .end = 0xc7fff,
104 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
105};
106
107static struct resource video_ram_resource = {
108 .name = "Video RAM area",
109 .start = 0xa0000,
110 .end = 0xbffff,
111 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
112};
113
114static struct resource standard_io_resources[] = { {
115 .name = "dma1",
116 .start = 0x0000,
117 .end = 0x001f,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "pic1",
121 .start = 0x0020,
122 .end = 0x0021,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "timer0",
126 .start = 0x0040,
127 .end = 0x0043,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "timer1",
131 .start = 0x0050,
132 .end = 0x0053,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "keyboard",
136 .start = 0x0060,
137 .end = 0x006f,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma page reg",
141 .start = 0x0080,
142 .end = 0x008f,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "pic2",
146 .start = 0x00a0,
147 .end = 0x00a1,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149}, {
150 .name = "dma2",
151 .start = 0x00c0,
152 .end = 0x00df,
153 .flags = IORESOURCE_BUSY | IORESOURCE_IO
154}, {
155 .name = "fpu",
156 .start = 0x00f0,
157 .end = 0x00ff,
158 .flags = IORESOURCE_BUSY | IORESOURCE_IO
159} };
160
161#define ROMSIGNATURE 0xaa55
162
163static int __init romsignature(const unsigned char *rom)
164{
165 const unsigned short * const ptr = (const unsigned short *)rom;
166 unsigned short sig;
167
168 return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
169}
170
171static int __init romchecksum(const unsigned char *rom, unsigned long length)
172{
173 unsigned char sum, c;
174
175 for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
176 sum += c;
177 return !length && !sum;
178}
179
180static void __init probe_roms(void)
181{
182 const unsigned char *rom;
183 unsigned long start, length, upper;
184 unsigned char c;
185 int i;
186
187 /* video rom */
188 upper = adapter_rom_resources[0].start;
189 for (start = video_rom_resource.start; start < upper; start += 2048) {
190 rom = isa_bus_to_virt(start);
191 if (!romsignature(rom))
192 continue;
193
194 video_rom_resource.start = start;
195
196 if (probe_kernel_address(rom + 2, c) != 0)
197 continue;
198
199 /* 0 < length <= 0x7f * 512, historically */
200 length = c * 512;
201
202 /* if checksum okay, trust length byte */
203 if (length && romchecksum(rom, length))
204 video_rom_resource.end = start + length - 1;
205
206 request_resource(&iomem_resource, &video_rom_resource);
207 break;
208 }
209
210 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
211 if (start < upper)
212 start = upper;
213
214 /* system rom */
215 request_resource(&iomem_resource, &system_rom_resource);
216 upper = system_rom_resource.start;
217
218 /* check for extension rom (ignore length byte!) */
219 rom = isa_bus_to_virt(extension_rom_resource.start);
220 if (romsignature(rom)) {
221 length = extension_rom_resource.end - extension_rom_resource.start + 1;
222 if (romchecksum(rom, length)) {
223 request_resource(&iomem_resource, &extension_rom_resource);
224 upper = extension_rom_resource.start;
225 }
226 }
227
228 /* check for adapter roms on 2k boundaries */
229 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
230 rom = isa_bus_to_virt(start);
231 if (!romsignature(rom))
232 continue;
233
234 if (probe_kernel_address(rom + 2, c) != 0)
235 continue;
236
237 /* 0 < length <= 0x7f * 512, historically */
238 length = c * 512;
239
240 /* but accept any length that fits if checksum okay */
241 if (!length || start + length > upper || !romchecksum(rom, length))
242 continue;
243
244 adapter_rom_resources[i].start = start;
245 adapter_rom_resources[i].end = start + length - 1;
246 request_resource(&iomem_resource, &adapter_rom_resources[i]);
247
248 start = adapter_rom_resources[i++].end & ~2047UL;
249 }
250}
251
252/*
253 * Request address space for all standard RAM and ROM resources
254 * and also for regions reported as reserved by the e820.
255 */
256static void __init
257legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
258{
259 int i;
260
261 probe_roms();
262 for (i = 0; i < e820.nr_map; i++) {
263 struct resource *res;
264#ifndef CONFIG_RESOURCES_64BIT
265 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
266 continue;
267#endif
268 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
269 switch (e820.map[i].type) {
270 case E820_RAM: res->name = "System RAM"; break;
271 case E820_ACPI: res->name = "ACPI Tables"; break;
272 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
273 default: res->name = "reserved";
274 }
275 res->start = e820.map[i].addr;
276 res->end = res->start + e820.map[i].size - 1;
277 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
278 if (request_resource(&iomem_resource, res)) {
279 kfree(res);
280 continue;
281 }
282 if (e820.map[i].type == E820_RAM) {
283 /*
284 * We don't know which RAM region contains kernel data,
285 * so we try it repeatedly and let the resource manager
286 * test it.
287 */
288 request_resource(res, code_resource);
289 request_resource(res, data_resource);
290#ifdef CONFIG_KEXEC
291 request_resource(res, &crashk_res);
292#endif
293 }
294 }
295}
296
297/*
298 * Request address space for all standard resources
299 *
300 * This is called just before pcibios_init(), which is also a
301 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
302 */
303static int __init request_standard_resources(void)
304{
305 int i;
306
307 printk("Setting up standard PCI resources\n");
308 if (efi_enabled)
309 efi_initialize_iomem_resources(&code_resource, &data_resource);
310 else
311 legacy_init_iomem_resources(&code_resource, &data_resource);
312
313 /* EFI systems may still have VGA */
314 request_resource(&iomem_resource, &video_ram_resource);
315
316 /* request I/O space for devices used on all i[345]86 PCs */
317 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
318 request_resource(&ioport_resource, &standard_io_resources[i]);
319 return 0;
320}
321
322subsys_initcall(request_standard_resources);
323
324#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
325/**
326 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
327 * correspond to e820 RAM areas and mark the corresponding pages as nosave for
328 * hibernation.
329 *
330 * This function requires the e820 map to be sorted and without any
331 * overlapping entries and assumes the first e820 area to be RAM.
332 */
333void __init e820_mark_nosave_regions(void)
334{
335 int i;
336 unsigned long pfn;
337
338 pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
339 for (i = 1; i < e820.nr_map; i++) {
340 struct e820entry *ei = &e820.map[i];
341
342 if (pfn < PFN_UP(ei->addr))
343 register_nosave_region(pfn, PFN_UP(ei->addr));
344
345 pfn = PFN_DOWN(ei->addr + ei->size);
346 if (ei->type != E820_RAM)
347 register_nosave_region(PFN_UP(ei->addr), pfn);
348
349 if (pfn >= max_low_pfn)
350 break;
351 }
352}
353#endif
354
355void __init add_memory_region(unsigned long long start,
356 unsigned long long size, int type)
357{
358 int x;
359
360 if (!efi_enabled) {
361 x = e820.nr_map;
362
363 if (x == E820MAX) {
364 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
365 return;
366 }
367
368 e820.map[x].addr = start;
369 e820.map[x].size = size;
370 e820.map[x].type = type;
371 e820.nr_map++;
372 }
373} /* add_memory_region */
374
375/*
376 * Sanitize the BIOS e820 map.
377 *
378 * Some e820 responses include overlapping entries. The following
379 * replaces the original e820 map with a new one, removing overlaps.
380 *
381 */
382int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
383{
384 struct change_member *change_tmp;
385 unsigned long current_type, last_type;
386 unsigned long long last_addr;
387 int chgidx, still_changing;
388 int overlap_entries;
389 int new_bios_entry;
390 int old_nr, new_nr, chg_nr;
391 int i;
392
393 /*
394 Visually we're performing the following (1,2,3,4 = memory types)...
395
396 Sample memory map (w/overlaps):
397 ____22__________________
398 ______________________4_
399 ____1111________________
400 _44_____________________
401 11111111________________
402 ____________________33__
403 ___________44___________
404 __________33333_________
405 ______________22________
406 ___________________2222_
407 _________111111111______
408 _____________________11_
409 _________________4______
410
411 Sanitized equivalent (no overlap):
412 1_______________________
413 _44_____________________
414 ___1____________________
415 ____22__________________
416 ______11________________
417 _________1______________
418 __________3_____________
419 ___________44___________
420 _____________33_________
421 _______________2________
422 ________________1_______
423 _________________4______
424 ___________________2____
425 ____________________33__
426 ______________________4_
427 */
428 /* if there's only one memory region, don't bother */
429 if (*pnr_map < 2) {
430 return -1;
431 }
432
433 old_nr = *pnr_map;
434
435 /* bail out if we find any unreasonable addresses in bios map */
436 for (i=0; i<old_nr; i++)
437 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
438 return -1;
439 }
440
441 /* create pointers for initial change-point information (for sorting) */
442 for (i=0; i < 2*old_nr; i++)
443 change_point[i] = &change_point_list[i];
444
445 /* record all known change-points (starting and ending addresses),
446 omitting those that are for empty memory regions */
447 chgidx = 0;
448 for (i=0; i < old_nr; i++) {
449 if (biosmap[i].size != 0) {
450 change_point[chgidx]->addr = biosmap[i].addr;
451 change_point[chgidx++]->pbios = &biosmap[i];
452 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
453 change_point[chgidx++]->pbios = &biosmap[i];
454 }
455 }
456 chg_nr = chgidx; /* true number of change-points */
457
458 /* sort change-point list by memory addresses (low -> high) */
459 still_changing = 1;
460 while (still_changing) {
461 still_changing = 0;
462 for (i=1; i < chg_nr; i++) {
463 /* if <current_addr> > <last_addr>, swap */
464 /* or, if current=<start_addr> & last=<end_addr>, swap */
465 if ((change_point[i]->addr < change_point[i-1]->addr) ||
466 ((change_point[i]->addr == change_point[i-1]->addr) &&
467 (change_point[i]->addr == change_point[i]->pbios->addr) &&
468 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
469 )
470 {
471 change_tmp = change_point[i];
472 change_point[i] = change_point[i-1];
473 change_point[i-1] = change_tmp;
474 still_changing=1;
475 }
476 }
477 }
478
479 /* create a new bios memory map, removing overlaps */
480 overlap_entries=0; /* number of entries in the overlap table */
481 new_bios_entry=0; /* index for creating new bios map entries */
482 last_type = 0; /* start with undefined memory type */
483 last_addr = 0; /* start with 0 as last starting address */
484 /* loop through change-points, determining effect on the new bios map */
485 for (chgidx=0; chgidx < chg_nr; chgidx++)
486 {
487 /* keep track of all overlapping bios entries */
488 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
489 {
490 /* add map entry to overlap list (> 1 entry implies an overlap) */
491 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
492 }
493 else
494 {
495 /* remove entry from list (order independent, so swap with last) */
496 for (i=0; i<overlap_entries; i++)
497 {
498 if (overlap_list[i] == change_point[chgidx]->pbios)
499 overlap_list[i] = overlap_list[overlap_entries-1];
500 }
501 overlap_entries--;
502 }
503 /* if there are overlapping entries, decide which "type" to use */
504 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
505 current_type = 0;
506 for (i=0; i<overlap_entries; i++)
507 if (overlap_list[i]->type > current_type)
508 current_type = overlap_list[i]->type;
509 /* continue building up new bios map based on this information */
510 if (current_type != last_type) {
511 if (last_type != 0) {
512 new_bios[new_bios_entry].size =
513 change_point[chgidx]->addr - last_addr;
514 /* move forward only if the new size was non-zero */
515 if (new_bios[new_bios_entry].size != 0)
516 if (++new_bios_entry >= E820MAX)
517 break; /* no more space left for new bios entries */
518 }
519 if (current_type != 0) {
520 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
521 new_bios[new_bios_entry].type = current_type;
522 last_addr=change_point[chgidx]->addr;
523 }
524 last_type = current_type;
525 }
526 }
527 new_nr = new_bios_entry; /* retain count for new bios entries */
528
529 /* copy new bios mapping into original location */
530 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
531 *pnr_map = new_nr;
532
533 return 0;
534}
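/*
 * Editorial note -- worked example, not part of the original source:
 * given the two overlapping BIOS entries
 *   { addr = 0x00000000, size = 0xa0000, type = 1 (usable RAM) }
 *   { addr = 0x00090000, size = 0x10000, type = 2 (reserved)   }
 * the change-point pass above keeps the larger type inside the overlap and
 * rewrites the map as
 *   { addr = 0x00000000, size = 0x90000, type = 1 }
 *   { addr = 0x00090000, size = 0x10000, type = 2 }
 * leaving *pnr_map = 2.
 */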
535
536/*
537 * Copy the BIOS e820 map into a safe place.
538 *
539 * Sanity-check it while we're at it..
540 *
541 * If we're lucky and live on a modern system, the setup code
542 * will have given us a memory map that we can use to properly
543 * set up memory. If we aren't, we'll fake a memory map.
544 *
545 * We check to see that the memory map contains at least 2 elements
546 * before we'll use it, because the detection code in setup.S may
547 * not be perfect and most every PC known to man has two memory
548 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
549 * thinkpad 560x, for example, does not cooperate with the memory
550 * detection code.)
551 */
552int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
553{
554 /* Only one memory region (or negative)? Ignore it */
555 if (nr_map < 2)
556 return -1;
557
558 do {
559 unsigned long long start = biosmap->addr;
560 unsigned long long size = biosmap->size;
561 unsigned long long end = start + size;
562 unsigned long type = biosmap->type;
563
564 /* Overflow in 64 bits? Ignore the memory map. */
565 if (start > end)
566 return -1;
567
568 /*
569 * Some BIOSes claim RAM in the 640k - 1M region.
570 * Not right. Fix it up.
571 */
572 if (type == E820_RAM) {
573 if (start < 0x100000ULL && end > 0xA0000ULL) {
574 if (start < 0xA0000ULL)
575 add_memory_region(start, 0xA0000ULL-start, type);
576 if (end <= 0x100000ULL)
577 continue;
578 start = 0x100000ULL;
579 size = end - start;
580 }
581 }
582 add_memory_region(start, size, type);
583 } while (biosmap++,--nr_map);
584 return 0;
585}
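/*
 * Editorial note -- worked example, not part of the original source:
 * a single BIOS entry { addr = 0, size = 0x200000, type = E820_RAM }
 * overlaps the 640k - 1M hole, so the fixup above records it as
 *   add_memory_region(0x000000, 0x0a0000, E820_RAM)    (0 - 640k)
 *   add_memory_region(0x100000, 0x100000, E820_RAM)    (1 MB - 2 MB)
 * and the 640k - 1M range is dropped from the map.
 */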
586
587/*
588 * Callback for efi_memory_walk.
589 */
590static int __init
591efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
592{
593 unsigned long *max_pfn = arg, pfn;
594
595 if (start < end) {
596 pfn = PFN_UP(end -1);
597 if (pfn > *max_pfn)
598 *max_pfn = pfn;
599 }
600 return 0;
601}
602
603static int __init
604efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
605{
606 memory_present(0, PFN_UP(start), PFN_DOWN(end));
607 return 0;
608}
609
610/*
611 * Find the highest page frame number we have available
612 */
613void __init find_max_pfn(void)
614{
615 int i;
616
617 max_pfn = 0;
618 if (efi_enabled) {
619 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
620 efi_memmap_walk(efi_memory_present_wrapper, NULL);
621 return;
622 }
623
624 for (i = 0; i < e820.nr_map; i++) {
625 unsigned long start, end;
626 /* RAM? */
627 if (e820.map[i].type != E820_RAM)
628 continue;
629 start = PFN_UP(e820.map[i].addr);
630 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
631 if (start >= end)
632 continue;
633 if (end > max_pfn)
634 max_pfn = end;
635 memory_present(0, start, end);
636 }
637}
638
639/*
640 * Free all available memory for boot time allocation. Used
641 * as a callback function by efi_memory_walk()
642 */
643
644static int __init
645free_available_memory(unsigned long start, unsigned long end, void *arg)
646{
647 /* check max_low_pfn */
648 if (start >= (max_low_pfn << PAGE_SHIFT))
649 return 0;
650 if (end >= (max_low_pfn << PAGE_SHIFT))
651 end = max_low_pfn << PAGE_SHIFT;
652 if (start < end)
653 free_bootmem(start, end - start);
654
655 return 0;
656}
657/*
658 * Register fully available low RAM pages with the bootmem allocator.
659 */
660void __init register_bootmem_low_pages(unsigned long max_low_pfn)
661{
662 int i;
663
664 if (efi_enabled) {
665 efi_memmap_walk(free_available_memory, NULL);
666 return;
667 }
668 for (i = 0; i < e820.nr_map; i++) {
669 unsigned long curr_pfn, last_pfn, size;
670 /*
671 * Reserve usable low memory
672 */
673 if (e820.map[i].type != E820_RAM)
674 continue;
675 /*
676 * We are rounding up the start address of usable memory:
677 */
678 curr_pfn = PFN_UP(e820.map[i].addr);
679 if (curr_pfn >= max_low_pfn)
680 continue;
681 /*
682 * ... and at the end of the usable range downwards:
683 */
684 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
685
686 if (last_pfn > max_low_pfn)
687 last_pfn = max_low_pfn;
688
689 /*
690 * .. finally, did all the rounding and playing
691 * around just make the area go away?
692 */
693 if (last_pfn <= curr_pfn)
694 continue;
695
696 size = last_pfn - curr_pfn;
697 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
698 }
699}
700
701void __init e820_register_memory(void)
702{
703 unsigned long gapstart, gapsize, round;
704 unsigned long long last;
705 int i;
706
707 /*
708 * Search for the biggest gap in the low 32 bits of the e820
709 * memory space.
710 */
711 last = 0x100000000ull;
712 gapstart = 0x10000000;
713 gapsize = 0x400000;
714 i = e820.nr_map;
715 while (--i >= 0) {
716 unsigned long long start = e820.map[i].addr;
717 unsigned long long end = start + e820.map[i].size;
718
719 /*
720 * Since "last" is at most 4GB, we know we'll
721 * fit in 32 bits if this condition is true
722 */
723 if (last > end) {
724 unsigned long gap = last - end;
725
726 if (gap > gapsize) {
727 gapsize = gap;
728 gapstart = end;
729 }
730 }
731 if (start < last)
732 last = start;
733 }
734
735 /*
736 * See how much we want to round up: start off with
737 * rounding to the next 1MB area.
738 */
739 round = 0x100000;
740 while ((gapsize >> 4) > round)
741 round += round;
742 /* Fun with two's complement */
743 pci_mem_start = (gapstart + round) & -round;
744
745 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
746 pci_mem_start, gapstart, gapsize);
747}
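/*
 * Editorial note -- worked example, not part of the original source:
 * with gapstart = 0xdff00000 and gapsize = 0x20000000 (512 MB), the loop
 * above doubles "round" from 1 MB until it is no smaller than gapsize/16,
 * giving round = 0x02000000 (32 MB).  Since -round is ~(round - 1) in
 * two's complement, (gapstart + round) & -round rounds up to the next
 * 32 MB boundary:
 *   (0xdff00000 + 0x02000000) & 0xfe000000 = 0xe0000000
 * so PCI resources would start at 0xe0000000.
 */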
748
749void __init print_memory_map(char *who)
750{
751 int i;
752
753 for (i = 0; i < e820.nr_map; i++) {
754 printk(" %s: %016Lx - %016Lx ", who,
755 e820.map[i].addr,
756 e820.map[i].addr + e820.map[i].size);
757 switch (e820.map[i].type) {
758 case E820_RAM: printk("(usable)\n");
759 break;
760 case E820_RESERVED:
761 printk("(reserved)\n");
762 break;
763 case E820_ACPI:
764 printk("(ACPI data)\n");
765 break;
766 case E820_NVS:
767 printk("(ACPI NVS)\n");
768 break;
769 default: printk("type %u\n", e820.map[i].type);
770 break;
771 }
772 }
773}
774
775static __init __always_inline void efi_limit_regions(unsigned long long size)
776{
777 unsigned long long current_addr = 0;
778 efi_memory_desc_t *md, *next_md;
779 void *p, *p1;
780 int i, j;
781
782 j = 0;
783 p1 = memmap.map;
784 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
785 md = p;
786 next_md = p1;
787 current_addr = md->phys_addr +
788 PFN_PHYS(md->num_pages);
789 if (is_available_memory(md)) {
790 if (md->phys_addr >= size) continue;
791 memcpy(next_md, md, memmap.desc_size);
792 if (current_addr >= size) {
793 next_md->num_pages -=
794 PFN_UP(current_addr-size);
795 }
796 p1 += memmap.desc_size;
797 next_md = p1;
798 j++;
799 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
800 EFI_MEMORY_RUNTIME) {
801 /* In order to make runtime services
802 * available we have to include runtime
803 * memory regions in memory map */
804 memcpy(next_md, md, memmap.desc_size);
805 p1 += memmap.desc_size;
806 next_md = p1;
807 j++;
808 }
809 }
810 memmap.nr_map = j;
811 memmap.map_end = memmap.map +
812 (memmap.nr_map * memmap.desc_size);
813}
814
815void __init limit_regions(unsigned long long size)
816{
817 unsigned long long current_addr;
818 int i;
819
820 print_memory_map("limit_regions start");
821 if (efi_enabled) {
822 efi_limit_regions(size);
823 return;
824 }
825 for (i = 0; i < e820.nr_map; i++) {
826 current_addr = e820.map[i].addr + e820.map[i].size;
827 if (current_addr < size)
828 continue;
829
830 if (e820.map[i].type != E820_RAM)
831 continue;
832
833 if (e820.map[i].addr >= size) {
834 /*
835 * This region starts past the end of the
836 * requested size, skip it completely.
837 */
838 e820.nr_map = i;
839 } else {
840 e820.nr_map = i + 1;
841 e820.map[i].size -= current_addr - size;
842 }
843 print_memory_map("limit_regions endfor");
844 return;
845 }
846 print_memory_map("limit_regions endfunc");
847}
848
849/*
850 * This function checks if any part of the range <start,end> is mapped
851 * with type.
852 */
853int
854e820_any_mapped(u64 start, u64 end, unsigned type)
855{
856 int i;
857 for (i = 0; i < e820.nr_map; i++) {
858 const struct e820entry *ei = &e820.map[i];
859 if (type && ei->type != type)
860 continue;
861 if (ei->addr >= end || ei->addr + ei->size <= start)
862 continue;
863 return 1;
864 }
865 return 0;
866}
867EXPORT_SYMBOL_GPL(e820_any_mapped);
868
869 /*
870 * This function checks if the entire range <start,end> is mapped with type.
871 *
872 * Note: this function only works correctly if the e820 table is sorted and
873 * non-overlapping, which is the case
874 */
875int __init
876e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
877{
878 u64 start = s;
879 u64 end = e;
880 int i;
881 for (i = 0; i < e820.nr_map; i++) {
882 struct e820entry *ei = &e820.map[i];
883 if (type && ei->type != type)
884 continue;
885 /* is the region (part) in overlap with the current region ?*/
886 if (ei->addr >= end || ei->addr + ei->size <= start)
887 continue;
888 /* if the region covers the beginning of <start,end> we move
889 * start to the end of that region, since the range is covered up to there
890 */
891 if (ei->addr <= start)
892 start = ei->addr + ei->size;
893 /* if start is now at or beyond end, we're done, full
894 * coverage */
895 if (start >= end)
896 return 1; /* we're done */
897 }
898 return 0;
899}
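/*
 * Editorial note -- hypothetical usage, not part of the original source:
 * a caller could check that the first megabyte is entirely usable RAM with
 *   e820_all_mapped(0, 0x100000, E820_RAM)
 * which returns 1 only when RAM entries cover the whole range, whereas
 * e820_any_mapped() above already returns 1 if any byte of the range is
 * covered by an entry of the requested type.
 */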
900
901static int __init parse_memmap(char *arg)
902{
903 if (!arg)
904 return -EINVAL;
905
906 if (strcmp(arg, "exactmap") == 0) {
907#ifdef CONFIG_CRASH_DUMP
908 /* If we are doing a crash dump, we
909 * still need to know the real mem
910 * size before original memory map is
911 * reset.
912 */
913 find_max_pfn();
914 saved_max_pfn = max_pfn;
915#endif
916 e820.nr_map = 0;
917 user_defined_memmap = 1;
918 } else {
919 /* If the user specifies memory size, we
920 * limit the BIOS-provided memory map to
921 * that size. exactmap can be used to specify
922 * the exact map. mem=number can be used to
923 * trim the existing memory map.
924 */
925 unsigned long long start_at, mem_size;
926
927 mem_size = memparse(arg, &arg);
928 if (*arg == '@') {
929 start_at = memparse(arg+1, &arg);
930 add_memory_region(start_at, mem_size, E820_RAM);
931 } else if (*arg == '#') {
932 start_at = memparse(arg+1, &arg);
933 add_memory_region(start_at, mem_size, E820_ACPI);
934 } else if (*arg == '$') {
935 start_at = memparse(arg+1, &arg);
936 add_memory_region(start_at, mem_size, E820_RESERVED);
937 } else {
938 limit_regions(mem_size);
939 user_defined_memmap = 1;
940 }
941 }
942 return 0;
943}
944early_param("memmap", parse_memmap);
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
new file mode 100644
index 000000000000..0f4d5e209e9b
--- /dev/null
+++ b/arch/x86/kernel/e820_64.c
@@ -0,0 +1,725 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 *
5 * Getting sanitize_e820_map() in sync with i386 version by applying change:
6 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
7 * Alex Achenbach <xela@slit.de>, December 2002.
8 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
9 *
10 */
11#include <linux/kernel.h>
12#include <linux/types.h>
13#include <linux/init.h>
14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/suspend.h>
21#include <linux/pfn.h>
22
23#include <asm/pgtable.h>
24#include <asm/page.h>
25#include <asm/e820.h>
26#include <asm/proto.h>
27#include <asm/bootsetup.h>
28#include <asm/sections.h>
29
30struct e820map e820;
31
32/*
33 * PFN of last memory page.
34 */
35unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37
38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */
43unsigned long end_pfn_map;
44
45/*
46 * Last pfn which the user wants to use.
47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49
50extern struct resource code_resource, data_resource;
51
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */
53static inline int bad_addr(unsigned long *addrp, unsigned long size)
54{
55 unsigned long addr = *addrp, last = addr + size;
56
57 /* various gunk below that needed for SMP startup */
58 if (addr < 0x8000) {
59 *addrp = PAGE_ALIGN(0x8000);
60 return 1;
61 }
62
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
72 addr < INITRD_START+INITRD_SIZE) {
73 *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
74 return 1;
75 }
76#endif
77 /* kernel code */
78 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
79 *addrp = PAGE_ALIGN(__pa_symbol(&_end));
80 return 1;
81 }
82
83 if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
84 *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
85 return 1;
86 }
87
88#ifdef CONFIG_NUMA
89 /* NUMA memory to node map */
90 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
91 *addrp = nodemap_addr + nodemap_size;
92 return 1;
93 }
94#endif
95 /* XXX ramdisk image here? */
96 return 0;
97}
98
99/*
100 * This function checks if any part of the range <start,end> is mapped
101 * with type.
102 */
103int
104e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
105{
106 int i;
107 for (i = 0; i < e820.nr_map; i++) {
108 struct e820entry *ei = &e820.map[i];
109 if (type && ei->type != type)
110 continue;
111 if (ei->addr >= end || ei->addr + ei->size <= start)
112 continue;
113 return 1;
114 }
115 return 0;
116}
117EXPORT_SYMBOL_GPL(e820_any_mapped);
118
119/*
120 * This function checks if the entire range <start,end> is mapped with type.
121 *
122 * Note: this function only works correctly if the e820 table is sorted and
123 * non-overlapping, which is the case
124 */
125int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
126{
127 int i;
128 for (i = 0; i < e820.nr_map; i++) {
129 struct e820entry *ei = &e820.map[i];
130 if (type && ei->type != type)
131 continue;
132 /* is the region (part) in overlap with the current region ?*/
133 if (ei->addr >= end || ei->addr + ei->size <= start)
134 continue;
135
136 /* if the region covers the beginning of <start,end> we move
137 * start to the end of that region, since the range is covered up to there
138 */
139 if (ei->addr <= start)
140 start = ei->addr + ei->size;
141 /* if start is now at or beyond end, we're done, full coverage */
142 if (start >= end)
143 return 1; /* we're done */
144 }
145 return 0;
146}
147
148/*
149 * Find a free area in a specific range.
150 */
151unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
152{
153 int i;
154 for (i = 0; i < e820.nr_map; i++) {
155 struct e820entry *ei = &e820.map[i];
156 unsigned long addr = ei->addr, last;
157 if (ei->type != E820_RAM)
158 continue;
159 if (addr < start)
160 addr = start;
161 if (addr > ei->addr + ei->size)
162 continue;
163 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
164 ;
165 last = PAGE_ALIGN(addr) + size;
166 if (last > ei->addr + ei->size)
167 continue;
168 if (last > end)
169 continue;
170 return addr;
171 }
172 return -1UL;
173}
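/*
 * Editorial note -- clarification, not part of the original source:
 * find_e820_area() returns the start of a block of "size" bytes of
 * E820_RAM inside [start, end) that avoids the areas rejected by
 * bad_addr() (SMP trampoline, kernel image, initrd, EBDA, ...), or
 * -1UL if no such block exists; it is used for early allocations
 * before bootmem takes over.
 */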
174
175/*
176 * Find the highest page frame number we have available
177 */
178unsigned long __init e820_end_of_ram(void)
179{
180 unsigned long end_pfn = 0;
181 end_pfn = find_max_pfn_with_active_regions();
182
183 if (end_pfn > end_pfn_map)
184 end_pfn_map = end_pfn;
185 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
186 end_pfn_map = MAXMEM>>PAGE_SHIFT;
187 if (end_pfn > end_user_pfn)
188 end_pfn = end_user_pfn;
189 if (end_pfn > end_pfn_map)
190 end_pfn = end_pfn_map;
191
192 printk("end_pfn_map = %lu\n", end_pfn_map);
193 return end_pfn;
194}
195
196/*
197 * Mark e820 reserved areas as busy for the resource manager.
198 */
199void __init e820_reserve_resources(void)
200{
201 int i;
202 for (i = 0; i < e820.nr_map; i++) {
203 struct resource *res;
204 res = alloc_bootmem_low(sizeof(struct resource));
205 switch (e820.map[i].type) {
206 case E820_RAM: res->name = "System RAM"; break;
207 case E820_ACPI: res->name = "ACPI Tables"; break;
208 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
209 default: res->name = "reserved";
210 }
211 res->start = e820.map[i].addr;
212 res->end = res->start + e820.map[i].size - 1;
213 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
214 request_resource(&iomem_resource, res);
215 if (e820.map[i].type == E820_RAM) {
216 /*
217 * We don't know which RAM region contains kernel data,
218 * so we try it repeatedly and let the resource manager
219 * test it.
220 */
221 request_resource(res, &code_resource);
222 request_resource(res, &data_resource);
223#ifdef CONFIG_KEXEC
224 request_resource(res, &crashk_res);
225#endif
226 }
227 }
228}
229
230/*
231 * Find the ranges of physical addresses that do not correspond to
232 * e820 RAM areas and mark the corresponding pages as nosave for software
233 * suspend and suspend to RAM.
234 *
235 * This function requires the e820 map to be sorted and without any
236 * overlapping entries and assumes the first e820 area to be RAM.
237 */
238void __init e820_mark_nosave_regions(void)
239{
240 int i;
241 unsigned long paddr;
242
243 paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
244 for (i = 1; i < e820.nr_map; i++) {
245 struct e820entry *ei = &e820.map[i];
246
247 if (paddr < ei->addr)
248 register_nosave_region(PFN_DOWN(paddr),
249 PFN_UP(ei->addr));
250
251 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
252 if (ei->type != E820_RAM)
253 register_nosave_region(PFN_UP(ei->addr),
254 PFN_DOWN(paddr));
255
256 if (paddr >= (end_pfn << PAGE_SHIFT))
257 break;
258 }
259}
260
261/*
262 * Finds an active region in the address range from start_pfn to end_pfn and
263 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
264 */
265static int __init e820_find_active_region(const struct e820entry *ei,
266 unsigned long start_pfn,
267 unsigned long end_pfn,
268 unsigned long *ei_startpfn,
269 unsigned long *ei_endpfn)
270{
271 *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
272 *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
273
274 /* Skip map entries smaller than a page */
275 if (*ei_startpfn >= *ei_endpfn)
276 return 0;
277
278 /* Check if end_pfn_map should be updated */
279 if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
280 end_pfn_map = *ei_endpfn;
281
282 /* Skip if map is outside the node */
283 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
284 *ei_startpfn >= end_pfn)
285 return 0;
286
287 /* Check for overlaps */
288 if (*ei_startpfn < start_pfn)
289 *ei_startpfn = start_pfn;
290 if (*ei_endpfn > end_pfn)
291 *ei_endpfn = end_pfn;
292
293 /* Obey end_user_pfn to save on memmap */
294 if (*ei_startpfn >= end_user_pfn)
295 return 0;
296 if (*ei_endpfn > end_user_pfn)
297 *ei_endpfn = end_user_pfn;
298
299 return 1;
300}
301
302/* Walk the e820 map and register active regions within a node */
303void __init
304e820_register_active_regions(int nid, unsigned long start_pfn,
305 unsigned long end_pfn)
306{
307 unsigned long ei_startpfn;
308 unsigned long ei_endpfn;
309 int i;
310
311 for (i = 0; i < e820.nr_map; i++)
312 if (e820_find_active_region(&e820.map[i],
313 start_pfn, end_pfn,
314 &ei_startpfn, &ei_endpfn))
315 add_active_range(nid, ei_startpfn, ei_endpfn);
316}
317
318/*
319 * Add a memory region to the kernel e820 map.
320 */
321void __init add_memory_region(unsigned long start, unsigned long size, int type)
322{
323 int x = e820.nr_map;
324
325 if (x == E820MAX) {
326 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
327 return;
328 }
329
330 e820.map[x].addr = start;
331 e820.map[x].size = size;
332 e820.map[x].type = type;
333 e820.nr_map++;
334}
335
336/*
337 * Find the hole size (in bytes) in the memory range.
338 * @start: starting address of the memory range to scan
339 * @end: ending address of the memory range to scan
340 */
341unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
342{
343 unsigned long start_pfn = start >> PAGE_SHIFT;
344 unsigned long end_pfn = end >> PAGE_SHIFT;
345 unsigned long ei_startpfn;
346 unsigned long ei_endpfn;
347 unsigned long ram = 0;
348 int i;
349
350 for (i = 0; i < e820.nr_map; i++) {
351 if (e820_find_active_region(&e820.map[i],
352 start_pfn, end_pfn,
353 &ei_startpfn, &ei_endpfn))
354 ram += ei_endpfn - ei_startpfn;
355 }
356 return end - start - (ram << PAGE_SHIFT);
357}
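/*
 * Editorial note -- worked example, not part of the original source:
 * if the only RAM entry below 4 GB is { addr = 0, size = 0xc0000000 },
 * then e820_hole_size(0, 0x100000000UL) finds 0xc0000 RAM pages and
 * returns 0x100000000 - 0xc0000000 = 0x40000000, i.e. a 1 GB hole
 * (typically the MMIO window below 4 GB).
 */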
358
359void __init e820_print_map(char *who)
360{
361 int i;
362
363 for (i = 0; i < e820.nr_map; i++) {
364 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
365 (unsigned long long) e820.map[i].addr,
366 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
367 switch (e820.map[i].type) {
368 case E820_RAM: printk("(usable)\n");
369 break;
370 case E820_RESERVED:
371 printk("(reserved)\n");
372 break;
373 case E820_ACPI:
374 printk("(ACPI data)\n");
375 break;
376 case E820_NVS:
377 printk("(ACPI NVS)\n");
378 break;
379 default: printk("type %u\n", e820.map[i].type);
380 break;
381 }
382 }
383}
384
385/*
386 * Sanitize the BIOS e820 map.
387 *
388 * Some e820 responses include overlapping entries. The following
389 * replaces the original e820 map with a new one, removing overlaps.
390 *
391 */
392static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
393{
394 struct change_member {
395 struct e820entry *pbios; /* pointer to original bios entry */
396 unsigned long long addr; /* address for this change point */
397 };
398 static struct change_member change_point_list[2*E820MAX] __initdata;
399 static struct change_member *change_point[2*E820MAX] __initdata;
400 static struct e820entry *overlap_list[E820MAX] __initdata;
401 static struct e820entry new_bios[E820MAX] __initdata;
402 struct change_member *change_tmp;
403 unsigned long current_type, last_type;
404 unsigned long long last_addr;
405 int chgidx, still_changing;
406 int overlap_entries;
407 int new_bios_entry;
408 int old_nr, new_nr, chg_nr;
409 int i;
410
411 /*
412 Visually we're performing the following (1,2,3,4 = memory types)...
413
414 Sample memory map (w/overlaps):
415 ____22__________________
416 ______________________4_
417 ____1111________________
418 _44_____________________
419 11111111________________
420 ____________________33__
421 ___________44___________
422 __________33333_________
423 ______________22________
424 ___________________2222_
425 _________111111111______
426 _____________________11_
427 _________________4______
428
429 Sanitized equivalent (no overlap):
430 1_______________________
431 _44_____________________
432 ___1____________________
433 ____22__________________
434 ______11________________
435 _________1______________
436 __________3_____________
437 ___________44___________
438 _____________33_________
439 _______________2________
440 ________________1_______
441 _________________4______
442 ___________________2____
443 ____________________33__
444 ______________________4_
445 */
446
447 /* if there's only one memory region, don't bother */
448 if (*pnr_map < 2)
449 return -1;
450
451 old_nr = *pnr_map;
452
453 /* bail out if we find any unreasonable addresses in bios map */
454 for (i=0; i<old_nr; i++)
455 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
456 return -1;
457
458 /* create pointers for initial change-point information (for sorting) */
459 for (i=0; i < 2*old_nr; i++)
460 change_point[i] = &change_point_list[i];
461
462 /* record all known change-points (starting and ending addresses),
463 omitting those that are for empty memory regions */
464 chgidx = 0;
465 for (i=0; i < old_nr; i++) {
466 if (biosmap[i].size != 0) {
467 change_point[chgidx]->addr = biosmap[i].addr;
468 change_point[chgidx++]->pbios = &biosmap[i];
469 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
470 change_point[chgidx++]->pbios = &biosmap[i];
471 }
472 }
473 chg_nr = chgidx;
474
475 /* sort change-point list by memory addresses (low -> high) */
476 still_changing = 1;
477 while (still_changing) {
478 still_changing = 0;
479 for (i=1; i < chg_nr; i++) {
480 /* if <current_addr> > <last_addr>, swap */
481 /* or, if current=<start_addr> & last=<end_addr>, swap */
482 if ((change_point[i]->addr < change_point[i-1]->addr) ||
483 ((change_point[i]->addr == change_point[i-1]->addr) &&
484 (change_point[i]->addr == change_point[i]->pbios->addr) &&
485 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
486 )
487 {
488 change_tmp = change_point[i];
489 change_point[i] = change_point[i-1];
490 change_point[i-1] = change_tmp;
491 still_changing=1;
492 }
493 }
494 }
495
496 /* create a new bios memory map, removing overlaps */
497 overlap_entries=0; /* number of entries in the overlap table */
498 new_bios_entry=0; /* index for creating new bios map entries */
499 last_type = 0; /* start with undefined memory type */
500 last_addr = 0; /* start with 0 as last starting address */
501 /* loop through change-points, determining effect on the new bios map */
502 for (chgidx=0; chgidx < chg_nr; chgidx++)
503 {
504 /* keep track of all overlapping bios entries */
505 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
506 {
507 /* add map entry to overlap list (> 1 entry implies an overlap) */
508 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
509 }
510 else
511 {
512 /* remove entry from list (order independent, so swap with last) */
513 for (i=0; i<overlap_entries; i++)
514 {
515 if (overlap_list[i] == change_point[chgidx]->pbios)
516 overlap_list[i] = overlap_list[overlap_entries-1];
517 }
518 overlap_entries--;
519 }
520 /* if there are overlapping entries, decide which "type" to use */
521 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
522 current_type = 0;
523 for (i=0; i<overlap_entries; i++)
524 if (overlap_list[i]->type > current_type)
525 current_type = overlap_list[i]->type;
526 /* continue building up new bios map based on this information */
527 if (current_type != last_type) {
528 if (last_type != 0) {
529 new_bios[new_bios_entry].size =
530 change_point[chgidx]->addr - last_addr;
531 /* move forward only if the new size was non-zero */
532 if (new_bios[new_bios_entry].size != 0)
533 if (++new_bios_entry >= E820MAX)
534 break; /* no more space left for new bios entries */
535 }
536 if (current_type != 0) {
537 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
538 new_bios[new_bios_entry].type = current_type;
539 last_addr=change_point[chgidx]->addr;
540 }
541 last_type = current_type;
542 }
543 }
544 new_nr = new_bios_entry; /* retain count for new bios entries */
545
546 /* copy new bios mapping into original location */
547 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
548 *pnr_map = new_nr;
549
550 return 0;
551}
552
553/*
554 * Copy the BIOS e820 map into a safe place.
555 *
556 * Sanity-check it while we're at it..
557 *
558 * If we're lucky and live on a modern system, the setup code
559 * will have given us a memory map that we can use to properly
560 * set up memory. If we aren't, we'll fake a memory map.
561 */
562static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
563{
564 /* Only one memory region (or negative)? Ignore it */
565 if (nr_map < 2)
566 return -1;
567
568 do {
569 unsigned long start = biosmap->addr;
570 unsigned long size = biosmap->size;
571 unsigned long end = start + size;
572 unsigned long type = biosmap->type;
573
574 /* Overflow in 64 bits? Ignore the memory map. */
575 if (start > end)
576 return -1;
577
578 add_memory_region(start, size, type);
579 } while (biosmap++,--nr_map);
580 return 0;
581}
582
583void early_panic(char *msg)
584{
585 early_printk(msg);
586 panic(msg);
587}
588
589void __init setup_memory_region(void)
590{
591 /*
592 * Try to copy the BIOS-supplied E820-map.
593 *
594 * Otherwise fake a memory map; one section from 0k->640k,
595 * the next section from 1mb->appropriate_mem_k
596 */
597 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
598 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
599 early_panic("Cannot find a valid memory map");
600 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
601 e820_print_map("BIOS-e820");
602}
603
604static int __init parse_memopt(char *p)
605{
606 if (!p)
607 return -EINVAL;
608 end_user_pfn = memparse(p, &p);
609 end_user_pfn >>= PAGE_SHIFT;
610 return 0;
611}
612early_param("mem", parse_memopt);
613
614static int userdef __initdata;
615
616static int __init parse_memmap_opt(char *p)
617{
618 char *oldp;
619 unsigned long long start_at, mem_size;
620
621 if (!strcmp(p, "exactmap")) {
622#ifdef CONFIG_CRASH_DUMP
623 /* If we are doing a crash dump, we
624 * still need to know the real mem
625 * size before original memory map is
626 * reset.
627 */
628 e820_register_active_regions(0, 0, -1UL);
629 saved_max_pfn = e820_end_of_ram();
630 remove_all_active_ranges();
631#endif
632 end_pfn_map = 0;
633 e820.nr_map = 0;
634 userdef = 1;
635 return 0;
636 }
637
638 oldp = p;
639 mem_size = memparse(p, &p);
640 if (p == oldp)
641 return -EINVAL;
642 if (*p == '@') {
643 start_at = memparse(p+1, &p);
644 add_memory_region(start_at, mem_size, E820_RAM);
645 } else if (*p == '#') {
646 start_at = memparse(p+1, &p);
647 add_memory_region(start_at, mem_size, E820_ACPI);
648 } else if (*p == '$') {
649 start_at = memparse(p+1, &p);
650 add_memory_region(start_at, mem_size, E820_RESERVED);
651 } else {
652 end_user_pfn = (mem_size >> PAGE_SHIFT);
653 }
654 return *p == '\0' ? 0 : -EINVAL;
655}
656early_param("memmap", parse_memmap_opt);
657
658void __init finish_e820_parsing(void)
659{
660 if (userdef) {
661 printk(KERN_INFO "user-defined physical RAM map:\n");
662 e820_print_map("user");
663 }
664}
665
666unsigned long pci_mem_start = 0xaeedbabe;
667EXPORT_SYMBOL(pci_mem_start);
668
669/*
670 * Search for the biggest gap in the low 32 bits of the e820
671 * memory space. We pass this space to PCI to assign MMIO resources
672 * for hotplug or unconfigured devices.
673 * Hopefully the BIOS left enough space for this.
674 */
675__init void e820_setup_gap(void)
676{
677 unsigned long gapstart, gapsize, round;
678 unsigned long last;
679 int i;
680 int found = 0;
681
682 last = 0x100000000ull;
683 gapstart = 0x10000000;
684 gapsize = 0x400000;
685 i = e820.nr_map;
686 while (--i >= 0) {
687 unsigned long long start = e820.map[i].addr;
688 unsigned long long end = start + e820.map[i].size;
689
690 /*
691 * Since "last" is at most 4GB, we know we'll
692 * fit in 32 bits if this condition is true
693 */
694 if (last > end) {
695 unsigned long gap = last - end;
696
697 if (gap > gapsize) {
698 gapsize = gap;
699 gapstart = end;
700 found = 1;
701 }
702 }
703 if (start < last)
704 last = start;
705 }
706
707 if (!found) {
708 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
709 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
710 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
711 }
712
713 /*
714 * See how much we want to round up: start off with
715 * rounding to the next 1MB area.
716 */
717 round = 0x100000;
718 while ((gapsize >> 4) > round)
719 round += round;
720 /* Fun with two's complement */
721 pci_mem_start = (gapstart + round) & -round;
722
723 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
724 pci_mem_start, gapstart, gapsize);
725}
diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c
new file mode 100644
index 000000000000..13aa4fd728f3
--- /dev/null
+++ b/arch/x86/kernel/early-quirks_64.c
@@ -0,0 +1,127 @@
1/* Various workarounds for chipset bugs.
2 This code runs very early and can't use the regular PCI subsystem.
3 The entries are keyed to PCI bridges which usually identify chipsets
4 uniquely.
5 This is only for whole classes of chipsets with specific problems which
6 need early invasive action (e.g. before the timers are initialized).
7 Most PCI device specific workarounds can be done later and should be
8 in standard PCI quirks.
9 Mainboard specific bugs should be handled by DMI entries.
10 CPU specific bugs in setup.c */
11
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci_ids.h>
15#include <asm/pci-direct.h>
16#include <asm/proto.h>
17#include <asm/iommu.h>
18#include <asm/dma.h>
19
20static void __init via_bugs(void)
21{
22#ifdef CONFIG_IOMMU
23 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
24 !iommu_aperture_allowed) {
25 printk(KERN_INFO
26 "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
27 iommu_aperture_disabled = 1;
28 }
29#endif
30}
31
32#ifdef CONFIG_ACPI
33
34static int __init nvidia_hpet_check(struct acpi_table_header *header)
35{
36 return 0;
37}
38#endif
39
40static void __init nvidia_bugs(void)
41{
42#ifdef CONFIG_ACPI
43 /*
44 * All timer overrides on Nvidia are
45 * wrong unless HPET is enabled.
46 * Unfortunately that's not true on many Asus boards.
47 * We don't know yet how to detect this automatically, but
48 * at least allow a command line override.
49 */
50 if (acpi_use_timer_override)
51 return;
52
53 if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
54 acpi_skip_timer_override = 1;
55 printk(KERN_INFO "Nvidia board "
56 "detected. Ignoring ACPI "
57 "timer override.\n");
58 printk(KERN_INFO "If you got timer trouble "
59 "try acpi_use_timer_override\n");
60 }
61#endif
62 /* RED-PEN skip them on mptables too? */
63
64}
65
66static void __init ati_bugs(void)
67{
68 if (timer_over_8254 == 1) {
69 timer_over_8254 = 0;
70 printk(KERN_INFO
71 "ATI board detected. Disabling timer routing over 8254.\n");
72 }
73}
74
75struct chipset {
76 u16 vendor;
77 void (*f)(void);
78};
79
80static struct chipset early_qrk[] __initdata = {
81 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
82 { PCI_VENDOR_ID_VIA, via_bugs },
83 { PCI_VENDOR_ID_ATI, ati_bugs },
84 {}
85};
86
87void __init early_quirks(void)
88{
89 int num, slot, func;
90
91 if (!early_pci_allowed())
92 return;
93
94 /* Poor man's PCI discovery */
95 for (num = 0; num < 32; num++) {
96 for (slot = 0; slot < 32; slot++) {
97 for (func = 0; func < 8; func++) {
98 u32 class;
99 u32 vendor;
100 u8 type;
101 int i;
102 class = read_pci_config(num,slot,func,
103 PCI_CLASS_REVISION);
104 if (class == 0xffffffff)
105 break;
106
107 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
108 continue;
109
110 vendor = read_pci_config(num, slot, func,
111 PCI_VENDOR_ID);
112 vendor &= 0xffff;
113
114 for (i = 0; early_qrk[i].f; i++)
115 if (early_qrk[i].vendor == vendor) {
116 early_qrk[i].f();
117 return;
118 }
119
120 type = read_pci_config_byte(num, slot, func,
121 PCI_HEADER_TYPE);
122 if (!(type & 0x80))
123 break;
124 }
125 }
126 }
127}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 000000000000..fd9aff3f3890
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,259 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <linux/screen_info.h>
6#include <asm/io.h>
7#include <asm/processor.h>
8#include <asm/fcntl.h>
9#include <xen/hvc-console.h>
10
11/* Simple VGA output */
12
13#ifdef __i386__
14#include <asm/setup.h>
15#else
16#include <asm/bootsetup.h>
17#endif
18#define VGABASE (__ISA_IO_base + 0xb8000)
19
20static int max_ypos = 25, max_xpos = 80;
21static int current_ypos = 25, current_xpos = 0;
22
23static void early_vga_write(struct console *con, const char *str, unsigned n)
24{
25 char c;
26 int i, k, j;
27
28 while ((c = *str++) != '\0' && n-- > 0) {
29 if (current_ypos >= max_ypos) {
30 /* scroll 1 line up */
31 for (k = 1, j = 0; k < max_ypos; k++, j++) {
32 for (i = 0; i < max_xpos; i++) {
33 writew(readw(VGABASE+2*(max_xpos*k+i)),
34 VGABASE + 2*(max_xpos*j + i));
35 }
36 }
37 for (i = 0; i < max_xpos; i++)
38 writew(0x720, VGABASE + 2*(max_xpos*j + i));
39 current_ypos = max_ypos-1;
40 }
41 if (c == '\n') {
42 current_xpos = 0;
43 current_ypos++;
44 } else if (c != '\r') {
45 writew(((0x7 << 8) | (unsigned short) c),
46 VGABASE + 2*(max_xpos*current_ypos +
47 current_xpos++));
48 if (current_xpos >= max_xpos) {
49 current_xpos = 0;
50 current_ypos++;
51 }
52 }
53 }
54}
55
56static struct console early_vga_console = {
57 .name = "earlyvga",
58 .write = early_vga_write,
59 .flags = CON_PRINTBUFFER,
60 .index = -1,
61};
62
63/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
64
65static int early_serial_base = 0x3f8; /* ttyS0 */
66
67#define XMTRDY 0x20
68
69#define DLAB 0x80
70
71#define TXR 0 /* Transmit register (WRITE) */
72#define RXR 0 /* Receive register (READ) */
73#define IER 1 /* Interrupt Enable */
74#define IIR 2 /* Interrupt ID */
75#define FCR 2 /* FIFO control */
76#define LCR 3 /* Line control */
77#define MCR 4 /* Modem control */
78#define LSR 5 /* Line Status */
79#define MSR 6 /* Modem Status */
80#define DLL 0 /* Divisor Latch Low */
81#define DLH 1 /* Divisor latch High */
82
83static int early_serial_putc(unsigned char ch)
84{
85 unsigned timeout = 0xffff;
86 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
87 cpu_relax();
88 outb(ch, early_serial_base + TXR);
89 return timeout ? 0 : -1;
90}
91
92static void early_serial_write(struct console *con, const char *s, unsigned n)
93{
94 while (*s && n-- > 0) {
95 if (*s == '\n')
96 early_serial_putc('\r');
97 early_serial_putc(*s);
98 s++;
99 }
100}
101
102#define DEFAULT_BAUD 9600
103
104static __init void early_serial_init(char *s)
105{
106 unsigned char c;
107 unsigned divisor;
108 unsigned baud = DEFAULT_BAUD;
109 char *e;
110
111 if (*s == ',')
112 ++s;
113
114 if (*s) {
115 unsigned port;
116 if (!strncmp(s,"0x",2)) {
117 early_serial_base = simple_strtoul(s, &e, 16);
118 } else {
119 static int bases[] = { 0x3f8, 0x2f8 };
120
121 if (!strncmp(s,"ttyS",4))
122 s += 4;
123 port = simple_strtoul(s, &e, 10);
124 if (port > 1 || s == e)
125 port = 0;
126 early_serial_base = bases[port];
127 }
128 s += strcspn(s, ",");
129 if (*s == ',')
130 s++;
131 }
132
133 outb(0x3, early_serial_base + LCR); /* 8n1 */
134 outb(0, early_serial_base + IER); /* no interrupt */
135 outb(0, early_serial_base + FCR); /* no fifo */
136 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
137
138 if (*s) {
139 baud = simple_strtoul(s, &e, 0);
140 if (baud == 0 || s == e)
141 baud = DEFAULT_BAUD;
142 }
143
144 divisor = 115200 / baud;
145 c = inb(early_serial_base + LCR);
146 outb(c | DLAB, early_serial_base + LCR);
147 outb(divisor & 0xff, early_serial_base + DLL);
148 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
149 outb(c & ~DLAB, early_serial_base + LCR);
150}
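/*
 * Editorial note -- worked example, not part of the original source:
 * the divisor programmed above is 115200 / baud, so
 *   115200 baud -> divisor 1
 *     9600 baud -> divisor 12 (the DEFAULT_BAUD fallback)
 * and it is written as DLL = divisor & 0xff, DLH = divisor >> 8 while the
 * DLAB bit is set in the line control register.
 */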
151
152static struct console early_serial_console = {
153 .name = "earlyser",
154 .write = early_serial_write,
155 .flags = CON_PRINTBUFFER,
156 .index = -1,
157};
158
159/* Console interface to a host file on AMD's SimNow! */
160
161static int simnow_fd;
162
163enum {
164 MAGIC1 = 0xBACCD00A,
165 MAGIC2 = 0xCA110000,
166 XOPEN = 5,
167 XWRITE = 4,
168};
169
170static noinline long simnow(long cmd, long a, long b, long c)
171{
172 long ret;
173 asm volatile("cpuid" :
174 "=a" (ret) :
175 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
176 return ret;
177}
178
179static void __init simnow_init(char *str)
180{
181 char *fn = "klog";
182 if (*str == '=')
183 fn = ++str;
184 /* error ignored */
185 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
186}
187
188static void simnow_write(struct console *con, const char *s, unsigned n)
189{
190 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
191}
192
193static struct console simnow_console = {
194 .name = "simnow",
195 .write = simnow_write,
196 .flags = CON_PRINTBUFFER,
197 .index = -1,
198};
199
200/* Direct interface for emergencies */
201struct console *early_console = &early_vga_console;
202static int early_console_initialized = 0;
203
204void early_printk(const char *fmt, ...)
205{
206 char buf[512];
207 int n;
208 va_list ap;
209
210 va_start(ap,fmt);
211 n = vscnprintf(buf,512,fmt,ap);
212 early_console->write(early_console,buf,n);
213 va_end(ap);
214}
215
216static int __initdata keep_early;
217
218static int __init setup_early_printk(char *buf)
219{
220 if (!buf)
221 return 0;
222
223 if (early_console_initialized)
224 return 0;
225 early_console_initialized = 1;
226
227 if (strstr(buf, "keep"))
228 keep_early = 1;
229
230 if (!strncmp(buf, "serial", 6)) {
231 early_serial_init(buf + 6);
232 early_console = &early_serial_console;
233 } else if (!strncmp(buf, "ttyS", 4)) {
234 early_serial_init(buf);
235 early_console = &early_serial_console;
236 } else if (!strncmp(buf, "vga", 3)
237 && SCREEN_INFO.orig_video_isVGA == 1) {
238 max_xpos = SCREEN_INFO.orig_video_cols;
239 max_ypos = SCREEN_INFO.orig_video_lines;
240 current_ypos = SCREEN_INFO.orig_y;
241 early_console = &early_vga_console;
242 } else if (!strncmp(buf, "simnow", 6)) {
243 simnow_init(buf + 6);
244 early_console = &simnow_console;
245 keep_early = 1;
246#ifdef CONFIG_HVC_XEN
247 } else if (!strncmp(buf, "xen", 3)) {
248 early_console = &xenboot_console;
249#endif
250 }
251
252 if (keep_early)
253 early_console->flags &= ~CON_BOOT;
254 else
255 early_console->flags |= CON_BOOT;
256 register_console(early_console);
257 return 0;
258}
259early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
new file mode 100644
index 000000000000..2452c6fbe992
--- /dev/null
+++ b/arch/x86/kernel/efi_32.c
@@ -0,0 +1,712 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * All EFI Runtime Services are not implemented yet as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h>
32#include <linux/kexec.h>
33
34#include <asm/setup.h>
35#include <asm/io.h>
36#include <asm/page.h>
37#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
41
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/*
58 * To make EFI calls to its runtime services in physical addressing mode we
59 * need a prolog/epilog around each invocation to disable interrupts, to
60 * claim the EFI runtime service handler exclusively and to duplicate the
61 * kernel mapping into low virtual address space (0 - 3G).
62 */
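/*
 * Editorial note -- clarification based on the code below, not part of the
 * original source: the prolog temporarily copies the page-directory entries
 * for PAGE_OFFSET over the entries for virtual address 0, giving the
 * firmware 1:1 mappings for the physical addresses it uses, and the epilog
 * restores the saved entries; both flush the TLB afterwards.
 */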
63
64static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2];
67
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
69{
70 unsigned long cr4;
71 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr;
73
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags);
76
77 /*
78 * If I don't have PSE, I should just duplicate two entries in page
79 * directory. If I have PSE, I just need to duplicate one entry in
80 * page directory.
81 */
82 cr4 = read_cr4();
83
84 if (cr4 & X86_CR4_PSE) {
85 efi_bak_pg_dir_pointer[0].pgd =
86 swapper_pg_dir[pgd_index(0)].pgd;
87 swapper_pg_dir[0].pgd =
88 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
89 } else {
90 efi_bak_pg_dir_pointer[0].pgd =
91 swapper_pg_dir[pgd_index(0)].pgd;
92 efi_bak_pg_dir_pointer[1].pgd =
93 swapper_pg_dir[pgd_index(0x400000)].pgd;
94 swapper_pg_dir[pgd_index(0)].pgd =
95 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
96 temp = PAGE_OFFSET + 0x400000;
97 swapper_pg_dir[pgd_index(0x400000)].pgd =
98 swapper_pg_dir[pgd_index(temp)].pgd;
99 }
100
101 /*
102 * After the lock is released, the original page table is restored.
103 */
104 local_flush_tlb();
105
106 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr);
109}
110
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
112{
113 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr;
115
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1;
118 load_gdt(&gdt_descr);
119
120 cr4 = read_cr4();
121
122 if (cr4 & X86_CR4_PSE) {
123 swapper_pg_dir[pgd_index(0)].pgd =
124 efi_bak_pg_dir_pointer[0].pgd;
125 } else {
126 swapper_pg_dir[pgd_index(0)].pgd =
127 efi_bak_pg_dir_pointer[0].pgd;
128 swapper_pg_dir[pgd_index(0x400000)].pgd =
129 efi_bak_pg_dir_pointer[1].pgd;
130 }
131
132 /*
133 * After the lock is released, the original page table is restored.
134 */
135 local_flush_tlb();
136
137 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend, therefore,
199 * we'll need to call both in physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216		printk("Oops: efitime: can't read time, status: 0x%lx\n", status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
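
A brief usage sketch (not part of the original file): a callback that tallies the free memory reported by the walker above. The parameter types mirror the unsigned long start/end values the walk passes to the callback; the names are illustrative only.

static unsigned long __initdata efi_free_bytes;

/* Accumulate the size of every available range the walk reports. */
static int __init add_free_range(unsigned long start, unsigned long end,
				 void *arg)
{
	*(unsigned long *)arg += end - start;
	return 0;			/* non-negative: keep walking */
}

static void __init count_efi_free_memory(void)
{
	efi_free_bytes = 0;
	efi_memmap_walk(add_free_range, &efi_free_bytes);
}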
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab = EFI_SYSTAB;
335 memmap.phys_map = EFI_MEMMAP;
336 memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
337 memmap.desc_version = EFI_MEMDESC_VERSION;
338 memmap.desc_size = EFI_MEMDESC_SIZE;
339
340 efi.systab = (efi_system_table_t *)
341 boot_ioremap((unsigned long) efi_phys.systab,
342 sizeof(efi_system_table_t));
343 /*
344 * Verify the EFI Table
345 */
346 if (efi.systab == NULL)
347 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
348 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
349 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
350 if ((efi.systab->hdr.revision >> 16) == 0)
351 printk(KERN_ERR PFX "Warning: EFI system table version "
352 "%d.%02d, expected 1.00 or greater\n",
353 efi.systab->hdr.revision >> 16,
354 efi.systab->hdr.revision & 0xffff);
355
356 /*
357 * Grab some details from the system table
358 */
359 num_config_tables = efi.systab->nr_tables;
360 config_tables = (efi_config_table_t *)efi.systab->tables;
361 runtime = efi.systab->runtime;
362
363 /*
364 * Show what we know for posterity
365 */
366 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
367 if (c16) {
368 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
369 vendor[i] = *c16++;
370 vendor[i] = '\0';
371 } else
372 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
373
374	printk(KERN_INFO PFX "EFI v%u.%02u by %s\n",
375 efi.systab->hdr.revision >> 16,
376 efi.systab->hdr.revision & 0xffff, vendor);
377
378 /*
379 * Let's see what config tables the firmware passed to us.
380 */
381 config_tables = (efi_config_table_t *)
382 boot_ioremap((unsigned long) config_tables,
383 num_config_tables * sizeof(efi_config_table_t));
384
385 if (config_tables == NULL)
386 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
387
388 efi.mps = EFI_INVALID_TABLE_ADDR;
389 efi.acpi = EFI_INVALID_TABLE_ADDR;
390 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
391 efi.smbios = EFI_INVALID_TABLE_ADDR;
392 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
393 efi.boot_info = EFI_INVALID_TABLE_ADDR;
394 efi.hcdp = EFI_INVALID_TABLE_ADDR;
395 efi.uga = EFI_INVALID_TABLE_ADDR;
396
397 for (i = 0; i < num_config_tables; i++) {
398 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
399 efi.mps = config_tables[i].table;
400 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
401 } else
402 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
403 efi.acpi20 = config_tables[i].table;
404 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
405 } else
406 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
407 efi.acpi = config_tables[i].table;
408 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
409 } else
410 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
411 efi.smbios = config_tables[i].table;
412 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
413 } else
414 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
415 efi.hcdp = config_tables[i].table;
416 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
417 } else
418 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
419 efi.uga = config_tables[i].table;
420 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
421 }
422 }
423 printk("\n");
424
425 /*
426 * Check out the runtime services table. We need to map
427 * the runtime services table so that we can grab the physical
428 * address of several of the EFI runtime functions, needed to
429 * set the firmware into virtual mode.
430 */
431
432 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
433 runtime,
434 sizeof(efi_runtime_services_t));
435 if (runtime != NULL) {
436 /*
437 * We will only need *early* access to the following
438 * two EFI runtime services before set_virtual_address_map
439 * is invoked.
440 */
441 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
442 efi_phys.set_virtual_address_map =
443 (efi_set_virtual_address_map_t *)
444 runtime->set_virtual_address_map;
445 } else
446 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
447
448 /* Map the EFI memory map for use until paging_init() */
449 memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
450 if (memmap.map == NULL)
451 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
452
453 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
454
455#if EFI_DEBUG
456 print_efi_memmap();
457#endif
458}
459
460static inline void __init check_range_for_systab(efi_memory_desc_t *md)
461{
462 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
463 ((unsigned long)efi_phys.systab < md->phys_addr +
464 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
465 unsigned long addr;
466
467 addr = md->virt_addr - md->phys_addr +
468 (unsigned long)efi_phys.systab;
469 efi.systab = (efi_system_table_t *)addr;
470 }
471}
472
473/*
474 * Wrap all the virtual calls in a way that forces the parameters on the stack.
475 */
476
477#define efi_call_virt(f, args...) \
478 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
479
480static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
481{
482 return efi_call_virt(get_time, tm, tc);
483}
484
485static efi_status_t virt_efi_set_time (efi_time_t *tm)
486{
487 return efi_call_virt(set_time, tm);
488}
489
490static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
491 efi_bool_t *pending,
492 efi_time_t *tm)
493{
494 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
495}
496
497static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
498 efi_time_t *tm)
499{
500 return efi_call_virt(set_wakeup_time, enabled, tm);
501}
502
503static efi_status_t virt_efi_get_variable (efi_char16_t *name,
504 efi_guid_t *vendor, u32 *attr,
505 unsigned long *data_size, void *data)
506{
507 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
508}
509
510static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
511 efi_char16_t *name,
512 efi_guid_t *vendor)
513{
514 return efi_call_virt(get_next_variable, name_size, name, vendor);
515}
516
517static efi_status_t virt_efi_set_variable (efi_char16_t *name,
518 efi_guid_t *vendor,
519 unsigned long attr,
520 unsigned long data_size, void *data)
521{
522 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
523}
524
525static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
526{
527 return efi_call_virt(get_next_high_mono_count, count);
528}
529
530static void virt_efi_reset_system (int reset_type, efi_status_t status,
531 unsigned long data_size,
532 efi_char16_t *data)
533{
534 efi_call_virt(reset_system, reset_type, status, data_size, data);
535}
536
537/*
538 * This function will switch the EFI runtime services to virtual mode.
539 * Essentially, look through the EFI memmap and map every region that
540 * has the runtime attribute bit set in its memory descriptor and update
541 * that memory descriptor with the virtual address obtained from ioremap().
542 * This enables the runtime services to be called without having to
543 * thunk back into physical mode for every invocation.
544 */
545
546void __init efi_enter_virtual_mode(void)
547{
548 efi_memory_desc_t *md;
549 efi_status_t status;
550 void *p;
551
552 efi.systab = NULL;
553
554 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
555 md = p;
556
557 if (!(md->attribute & EFI_MEMORY_RUNTIME))
558 continue;
559
560 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
561 md->num_pages << EFI_PAGE_SHIFT);
562 if (!(unsigned long)md->virt_addr) {
563 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
564 (unsigned long)md->phys_addr);
565 }
566 /* update the virtual address of the EFI system table */
567 check_range_for_systab(md);
568 }
569
570 BUG_ON(!efi.systab);
571
572 status = phys_efi_set_virtual_address_map(
573 memmap.desc_size * memmap.nr_map,
574 memmap.desc_size,
575 memmap.desc_version,
576 memmap.phys_map);
577
578 if (status != EFI_SUCCESS) {
579 printk (KERN_ALERT "You are screwed! "
580 "Unable to switch EFI into virtual mode "
581 "(status=%lx)\n", status);
582 panic("EFI call to SetVirtualAddressMap() failed!");
583 }
584
585 /*
586 * Now that EFI is in virtual mode, update the function
587 * pointers in the runtime service table to the new virtual addresses.
588 */
589
590 efi.get_time = virt_efi_get_time;
591 efi.set_time = virt_efi_set_time;
592 efi.get_wakeup_time = virt_efi_get_wakeup_time;
593 efi.set_wakeup_time = virt_efi_set_wakeup_time;
594 efi.get_variable = virt_efi_get_variable;
595 efi.get_next_variable = virt_efi_get_next_variable;
596 efi.set_variable = virt_efi_set_variable;
597 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
598 efi.reset_system = virt_efi_reset_system;
599}
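
Once the virtual-mode pointers above are installed, other kernel code can call the runtime services directly through the efi structure. A hypothetical sketch (the helper name, variable name and vendor GUID are illustrative, using the get_variable signature shown above):

static efi_status_t example_read_boot_current(u16 *value)
{
	static efi_char16_t name[] = {
		'B', 'o', 'o', 't', 'C', 'u', 'r', 'r', 'e', 'n', 't', 0
	};
	efi_guid_t vendor = EFI_GUID(0x8be4df61, 0x93ca, 0x11d2,
				     0xaa, 0x0d, 0x00, 0xe0, 0x98,
				     0x03, 0x2b, 0x8c);
	unsigned long size = sizeof(*value);
	u32 attr;

	return efi.get_variable(name, &vendor, &attr, &size, value);
}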
600
601void __init
602efi_initialize_iomem_resources(struct resource *code_resource,
603 struct resource *data_resource)
604{
605 struct resource *res;
606 efi_memory_desc_t *md;
607 void *p;
608
609 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
610 md = p;
611
612 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
613 0x100000000ULL)
614 continue;
615 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
616 switch (md->type) {
617 case EFI_RESERVED_TYPE:
618 res->name = "Reserved Memory";
619 break;
620 case EFI_LOADER_CODE:
621 res->name = "Loader Code";
622 break;
623 case EFI_LOADER_DATA:
624 res->name = "Loader Data";
625 break;
626 case EFI_BOOT_SERVICES_DATA:
627 res->name = "BootServices Data";
628 break;
629 case EFI_BOOT_SERVICES_CODE:
630 res->name = "BootServices Code";
631 break;
632 case EFI_RUNTIME_SERVICES_CODE:
633 res->name = "Runtime Service Code";
634 break;
635 case EFI_RUNTIME_SERVICES_DATA:
636 res->name = "Runtime Service Data";
637 break;
638 case EFI_CONVENTIONAL_MEMORY:
639 res->name = "Conventional Memory";
640 break;
641 case EFI_UNUSABLE_MEMORY:
642 res->name = "Unusable Memory";
643 break;
644 case EFI_ACPI_RECLAIM_MEMORY:
645 res->name = "ACPI Reclaim";
646 break;
647 case EFI_ACPI_MEMORY_NVS:
648 res->name = "ACPI NVS";
649 break;
650 case EFI_MEMORY_MAPPED_IO:
651 res->name = "Memory Mapped IO";
652 break;
653 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
654 res->name = "Memory Mapped IO Port Space";
655 break;
656 default:
657 res->name = "Reserved";
658 break;
659 }
660 res->start = md->phys_addr;
661 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
662 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
663 if (request_resource(&iomem_resource, res) < 0)
664 printk(KERN_ERR PFX "Failed to allocate res %s : "
665 "0x%llx-0x%llx\n", res->name,
666 (unsigned long long)res->start,
667 (unsigned long long)res->end);
668 /*
669 * We don't know which region contains kernel data so we try
670 * it repeatedly and let the resource manager test it.
671 */
672 if (md->type == EFI_CONVENTIONAL_MEMORY) {
673 request_resource(res, code_resource);
674 request_resource(res, data_resource);
675#ifdef CONFIG_KEXEC
676 request_resource(res, &crashk_res);
677#endif
678 }
679 }
680}
681
682/*
683 * Convenience functions to obtain memory types and attributes
684 */
685
686u32 efi_mem_type(unsigned long phys_addr)
687{
688 efi_memory_desc_t *md;
689 void *p;
690
691 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
692 md = p;
693 if ((md->phys_addr <= phys_addr) && (phys_addr <
694		    (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
695 return md->type;
696 }
697 return 0;
698}
699
700u64 efi_mem_attributes(unsigned long phys_addr)
701{
702 efi_memory_desc_t *md;
703 void *p;
704
705 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
706 md = p;
707 if ((md->phys_addr <= phys_addr) && (phys_addr <
708		    (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
709 return md->attribute;
710 }
711 return 0;
712}
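
A small helper built on the two lookups above, shown only as an illustration (it is not part of the file): test whether a physical address lies in ordinary write-back conventional memory, using the same EFI_CONVENTIONAL_MEMORY and EFI_MEMORY_WB constants used earlier in this file.

static int phys_addr_is_wb_ram(unsigned long phys_addr)
{
	return efi_mem_type(phys_addr) == EFI_CONVENTIONAL_MEMORY &&
	       (efi_mem_attributes(phys_addr) & EFI_MEMORY_WB);
}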
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
new file mode 100644
index 000000000000..ef00bb77d7e4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_32.S
@@ -0,0 +1,122 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/linkage.h>
9#include <asm/page.h>
10
11/*
12 * efi_call_phys(void *, ...) is a variadic function.
13 * All callers of this function ensure that every parameter is 4 bytes wide.
14 */
15
16/*
17 * In the gcc calling convention EBX, ESP, EBP, ESI and EDI are callee-saved.
18 * Save all of them on entry and restore them on exit, no matter how many we
19 * actually use, because we cannot assume that the EFI runtime service
20 * functions follow the gcc calling convention as well.
21 */
22
23.text
24ENTRY(efi_call_phys)
25 /*
26 * 0. This function is only ever called from the Linux kernel, so CS is
27 * 0x0010 and DS/SS are 0x0018. EFI uses the same selector values and
28 * the corresponding GDT entries are identical, so the segment registers
29 * and the GDT contents are left untouched; only the GDT base register
30 * is switched in the prelog and epilog.
31 */
32
33 /*
34 * 1. We are currently running with EIP = <physical address> + PAGE_OFFSET.
35 * To switch smoothly from virtual mode to flat (physical) mode, the
36 * mapping of low virtual memory is created in the prelog and removed
37 * again in the epilog.
38 */
39 movl $1f, %edx
40 subl $__PAGE_OFFSET, %edx
41 jmp *%edx
421:
43
44 /*
45 * 2. Now on the top of stack is the return
46 * address in the caller of efi_call_phys(), then parameter 1,
47 * parameter 2, ..., param n. To make things easy, we save the return
48 * address of efi_call_phys in a global variable.
49 */
50 popl %edx
51 movl %edx, saved_return_addr
52 /* get the function pointer into ECX*/
53 popl %ecx
54 movl %ecx, efi_rt_function_ptr
55 movl $2f, %edx
56 subl $__PAGE_OFFSET, %edx
57 pushl %edx
58
59 /*
60 * 3. Clear PG bit in %CR0.
61 */
62 movl %cr0, %edx
63 andl $0x7fffffff, %edx
64 movl %edx, %cr0
65 jmp 1f
661:
67
68 /*
69 * 4. Adjust stack pointer.
70 */
71 subl $__PAGE_OFFSET, %esp
72
73 /*
74 * 5. Call the physical function.
75 */
76 jmp *%ecx
77
782:
79 /*
80 * 6. When the EFI runtime service returns, control arrives at the
81 * following instruction. Readjust the stack pointer first.
82 */
83 addl $__PAGE_OFFSET, %esp
84
85 /*
86 * 7. Restore PG bit
87 */
88 movl %cr0, %edx
89 orl $0x80000000, %edx
90 movl %edx, %cr0
91 jmp 1f
921:
93 /*
94 * 8. Now switch back from flat mode to virtual mode by jumping to a
95 * label whose address already includes PAGE_OFFSET.
96 */
97 movl $1f, %edx
98 jmp *%edx
991:
100
101 /*
102 * 9. Balance the stack. EAX contains the return value, so take care
103 * not to clobber it.
104 */
105 leal efi_rt_function_ptr, %edx
106 movl (%edx), %ecx
107 pushl %ecx
108
109 /*
110 * 10. Push the saved return address onto the stack and return.
111 */
112 leal saved_return_addr, %edx
113 movl (%edx), %ecx
114 pushl %ecx
115 ret
116.previous
117
118.data
119saved_return_addr:
120 .long 0
121efi_rt_function_ptr:
122 .long 0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
new file mode 100644
index 000000000000..290b7bc82da3
--- /dev/null
+++ b/arch/x86/kernel/entry_32.S
@@ -0,0 +1,1112 @@
1/*
2 * linux/arch/i386/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * entry.S contains the system-call and fault low-level handling routines.
9 * This also contains the timer-interrupt handler, as well as all interrupts
10 * and faults that can result in a task-switch.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after a timer-interrupt and after each system call.
14 *
15 * I changed all the .align's to 4 (16 byte alignment), as that's faster
16 * on a 486.
17 *
18 * Stack layout in 'syscall_exit':
19 * ptrace needs to have all regs on the stack.
20 * if the order here is changed, it needs to be
21 * updated in fork.c:copy_process, signal.c:do_signal,
22 * ptrace.c and ptrace.h
23 *
24 * 0(%esp) - %ebx
25 * 4(%esp) - %ecx
26 * 8(%esp) - %edx
27 * C(%esp) - %esi
28 * 10(%esp) - %edi
29 * 14(%esp) - %ebp
30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds
32 * 20(%esp) - %es
33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs
37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
40 *
41 * "current" is in register %ebx during any slow entries.
42 */
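
For reference only, a C sketch mirroring the stack layout listed above; it is assumed to correspond to the struct pt_regs definition of this kernel generation (asm/ptrace.h is authoritative) and is shown purely to make the offsets easier to follow.

struct pt_regs {		/* offsets from the comment above */
	long ebx;		/* 0x00 */
	long ecx;		/* 0x04 */
	long edx;		/* 0x08 */
	long esi;		/* 0x0C */
	long edi;		/* 0x10 */
	long ebp;		/* 0x14 */
	long eax;		/* 0x18 */
	int  xds;		/* 0x1C */
	int  xes;		/* 0x20 */
	int  xfs;		/* 0x24 */
	long orig_eax;		/* 0x28 */
	long eip;		/* 0x2C */
	int  xcs;		/* 0x30 */
	long eflags;		/* 0x34 */
	long esp;		/* 0x38 */
	int  xss;		/* 0x3C */
};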
43
44#include <linux/linkage.h>
45#include <asm/thread_info.h>
46#include <asm/irqflags.h>
47#include <asm/errno.h>
48#include <asm/segment.h>
49#include <asm/smp.h>
50#include <asm/page.h>
51#include <asm/desc.h>
52#include <asm/percpu.h>
53#include <asm/dwarf2.h>
54#include "irq_vectors.h"
55
56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
68
69#define nr_syscalls ((syscall_table_size)/4)
70
71CF_MASK = 0x00000001
72TF_MASK = 0x00000100
73IF_MASK = 0x00000200
74DF_MASK = 0x00000400
75NT_MASK = 0x00004000
76VM_MASK = 0x00020000
77
78#ifdef CONFIG_PREEMPT
79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
80#else
81#define preempt_stop(clobbers)
82#define resume_kernel restore_nocheck
83#endif
84
85.macro TRACE_IRQS_IRET
86#ifdef CONFIG_TRACE_IRQFLAGS
87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
88 jz 1f
89 TRACE_IRQS_ON
901:
91#endif
92.endm
93
94#ifdef CONFIG_VM86
95#define resume_userspace_sig check_userspace
96#else
97#define resume_userspace_sig resume_userspace
98#endif
99
100#define SAVE_ALL \
101 cld; \
102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\
108 pushl %ds; \
109 CFI_ADJUST_CFA_OFFSET 4;\
110 /*CFI_REL_OFFSET ds, 0;*/\
111 pushl %eax; \
112 CFI_ADJUST_CFA_OFFSET 4;\
113 CFI_REL_OFFSET eax, 0;\
114 pushl %ebp; \
115 CFI_ADJUST_CFA_OFFSET 4;\
116 CFI_REL_OFFSET ebp, 0;\
117 pushl %edi; \
118 CFI_ADJUST_CFA_OFFSET 4;\
119 CFI_REL_OFFSET edi, 0;\
120 pushl %esi; \
121 CFI_ADJUST_CFA_OFFSET 4;\
122 CFI_REL_OFFSET esi, 0;\
123 pushl %edx; \
124 CFI_ADJUST_CFA_OFFSET 4;\
125 CFI_REL_OFFSET edx, 0;\
126 pushl %ecx; \
127 CFI_ADJUST_CFA_OFFSET 4;\
128 CFI_REL_OFFSET ecx, 0;\
129 pushl %ebx; \
130 CFI_ADJUST_CFA_OFFSET 4;\
131 CFI_REL_OFFSET ebx, 0;\
132 movl $(__USER_DS), %edx; \
133 movl %edx, %ds; \
134 movl %edx, %es; \
135 movl $(__KERNEL_PERCPU), %edx; \
136 movl %edx, %fs
137
138#define RESTORE_INT_REGS \
139 popl %ebx; \
140 CFI_ADJUST_CFA_OFFSET -4;\
141 CFI_RESTORE ebx;\
142 popl %ecx; \
143 CFI_ADJUST_CFA_OFFSET -4;\
144 CFI_RESTORE ecx;\
145 popl %edx; \
146 CFI_ADJUST_CFA_OFFSET -4;\
147 CFI_RESTORE edx;\
148 popl %esi; \
149 CFI_ADJUST_CFA_OFFSET -4;\
150 CFI_RESTORE esi;\
151 popl %edi; \
152 CFI_ADJUST_CFA_OFFSET -4;\
153 CFI_RESTORE edi;\
154 popl %ebp; \
155 CFI_ADJUST_CFA_OFFSET -4;\
156 CFI_RESTORE ebp;\
157 popl %eax; \
158 CFI_ADJUST_CFA_OFFSET -4;\
159 CFI_RESTORE eax
160
161#define RESTORE_REGS \
162 RESTORE_INT_REGS; \
1631: popl %ds; \
164 CFI_ADJUST_CFA_OFFSET -4;\
165 /*CFI_RESTORE ds;*/\
1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\
1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \
1776: movl $0,(%esp); \
178 jmp 3b; \
179.section __ex_table,"a";\
180 .align 4; \
181 .long 1b,4b; \
182 .long 2b,5b; \
183 .long 3b,6b; \
184.popsection
185
186#define RING0_INT_FRAME \
187 CFI_STARTPROC simple;\
188 CFI_SIGNAL_FRAME;\
189 CFI_DEF_CFA esp, 3*4;\
190 /*CFI_OFFSET cs, -2*4;*/\
191 CFI_OFFSET eip, -3*4
192
193#define RING0_EC_FRAME \
194 CFI_STARTPROC simple;\
195 CFI_SIGNAL_FRAME;\
196 CFI_DEF_CFA esp, 4*4;\
197 /*CFI_OFFSET cs, -2*4;*/\
198 CFI_OFFSET eip, -3*4
199
200#define RING0_PTREGS_FRAME \
201 CFI_STARTPROC simple;\
202 CFI_SIGNAL_FRAME;\
203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
215
216ENTRY(ret_from_fork)
217 CFI_STARTPROC
218 pushl %eax
219 CFI_ADJUST_CFA_OFFSET 4
220 call schedule_tail
221 GET_THREAD_INFO(%ebp)
222 popl %eax
223 CFI_ADJUST_CFA_OFFSET -4
224 pushl $0x0202 # Reset kernel eflags
225 CFI_ADJUST_CFA_OFFSET 4
226 popfl
227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit
229 CFI_ENDPROC
230END(ret_from_fork)
231
232/*
233 * Return to user mode is not as complex as all this looks,
234 * but we want the default path for a system call return to
235 * go as quickly as possible which is why some of this is
236 * less clear than it otherwise should be.
237 */
238
239 # userspace resumption stub bypassing syscall exit tracing
240 ALIGN
241 RING0_PTREGS_FRAME
242ret_from_exception:
243 preempt_stop(CLBR_ANY)
244ret_from_intr:
245 GET_THREAD_INFO(%ebp)
246check_userspace:
247 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
248 movb PT_CS(%esp), %al
249 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
250 cmpl $USER_RPL, %eax
251 jb resume_kernel # not returning to v8086 or userspace
252
253ENTRY(resume_userspace)
254 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
255 # setting need_resched or sigpending
256 # between sampling and the iret
257 movl TI_flags(%ebp), %ecx
258 andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
259 # int/exception return?
260 jne work_pending
261 jmp restore_all
262END(ret_from_exception)
263
264#ifdef CONFIG_PREEMPT
265ENTRY(resume_kernel)
266 DISABLE_INTERRUPTS(CLBR_ANY)
267 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
268 jnz restore_nocheck
269need_resched:
270 movl TI_flags(%ebp), %ecx # need_resched set ?
271 testb $_TIF_NEED_RESCHED, %cl
272 jz restore_all
273 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
274 jz restore_all
275 call preempt_schedule_irq
276 jmp need_resched
277END(resume_kernel)
278#endif
279 CFI_ENDPROC
280
281/* SYSENTER_RETURN points to after the "sysenter" instruction in
282 the vsyscall page. See vsyscall-sysenter.S, which defines the symbol. */
283
284 # sysenter call handler stub
285ENTRY(sysenter_entry)
286 CFI_STARTPROC simple
287 CFI_SIGNAL_FRAME
288 CFI_DEF_CFA esp, 0
289 CFI_REGISTER esp, ebp
290 movl TSS_sysenter_esp0(%esp),%esp
291sysenter_past_esp:
292 /*
293 * No need to follow this irqs on/off section: the syscall
294 * disabled irqs and here we enable it straight after entry:
295 */
296 ENABLE_INTERRUPTS(CLBR_NONE)
297 pushl $(__USER_DS)
298 CFI_ADJUST_CFA_OFFSET 4
299 /*CFI_REL_OFFSET ss, 0*/
300 pushl %ebp
301 CFI_ADJUST_CFA_OFFSET 4
302 CFI_REL_OFFSET esp, 0
303 pushfl
304 CFI_ADJUST_CFA_OFFSET 4
305 pushl $(__USER_CS)
306 CFI_ADJUST_CFA_OFFSET 4
307 /*CFI_REL_OFFSET cs, 0*/
308 /*
309 * Push current_thread_info()->sysenter_return to the stack.
310 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
311 * pushed above; +8 corresponds to copy_thread's esp0 setting.
312 */
313 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
314 CFI_ADJUST_CFA_OFFSET 4
315 CFI_REL_OFFSET eip, 0
316
317/*
318 * Load the potential sixth argument from user stack.
319 * Careful about security.
320 */
321 cmpl $__PAGE_OFFSET-3,%ebp
322 jae syscall_fault
3231: movl (%ebp),%ebp
324.section __ex_table,"a"
325 .align 4
326 .long 1b,syscall_fault
327.previous
328
329 pushl %eax
330 CFI_ADJUST_CFA_OFFSET 4
331 SAVE_ALL
332 GET_THREAD_INFO(%ebp)
333
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
336 jnz syscall_trace_entry
337 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys
339 call *sys_call_table(,%eax,4)
340 movl %eax,PT_EAX(%esp)
341 DISABLE_INTERRUPTS(CLBR_ANY)
342 TRACE_IRQS_OFF
343 movl TI_flags(%ebp), %ecx
344 testw $_TIF_ALLWORK_MASK, %cx
345 jne syscall_exit_work
346/* if something modifies registers it must also disable sysexit */
347 movl PT_EIP(%esp), %edx
348 movl PT_OLDESP(%esp), %ecx
349 xorl %ebp,%ebp
350 TRACE_IRQS_ON
3511: mov PT_FS(%esp), %fs
352 ENABLE_INTERRUPTS_SYSEXIT
353 CFI_ENDPROC
354.pushsection .fixup,"ax"
3552: movl $0,PT_FS(%esp)
356 jmp 1b
357.section __ex_table,"a"
358 .align 4
359 .long 1b,2b
360.popsection
361ENDPROC(sysenter_entry)
362
363 # system call handler stub
364ENTRY(system_call)
365 RING0_INT_FRAME # can't unwind into user space anyway
366 pushl %eax # save orig_eax
367 CFI_ADJUST_CFA_OFFSET 4
368 SAVE_ALL
369 GET_THREAD_INFO(%ebp)
370 # system call tracing in operation / emulation
371 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
372 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
373 jnz syscall_trace_entry
374 cmpl $(nr_syscalls), %eax
375 jae syscall_badsys
376syscall_call:
377 call *sys_call_table(,%eax,4)
378 movl %eax,PT_EAX(%esp) # store the return value
379syscall_exit:
380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
381 # setting need_resched or sigpending
382 # between sampling and the iret
383 TRACE_IRQS_OFF
384 testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
385 jz no_singlestep
386 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
387no_singlestep:
388 movl TI_flags(%ebp), %ecx
389 testw $_TIF_ALLWORK_MASK, %cx # current->work
390 jne syscall_exit_work
391
392restore_all:
393 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
394 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
395 # are returning to the kernel.
396 # See comments in process.c:copy_thread() for details.
397 movb PT_OLDSS(%esp), %ah
398 movb PT_CS(%esp), %al
399 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
400 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
401 CFI_REMEMBER_STATE
402 je ldt_ss # returning to user-space with LDT SS
403restore_nocheck:
404 TRACE_IRQS_IRET
405restore_nocheck_notrace:
406 RESTORE_REGS
407 addl $4, %esp # skip orig_eax/error_code
408 CFI_ADJUST_CFA_OFFSET -4
4091: INTERRUPT_RETURN
410.section .fixup,"ax"
411iret_exc:
412 pushl $0 # no error code
413 pushl $do_iret_error
414 jmp error_code
415.previous
416.section __ex_table,"a"
417 .align 4
418 .long 1b,iret_exc
419.previous
420
421 CFI_RESTORE_STATE
422ldt_ss:
423 larl PT_OLDSS(%esp), %eax
424 jnz restore_nocheck
425 testl $0x00400000, %eax # returning to 32bit stack?
426	jnz restore_nocheck		# all right, normal return
427
428#ifdef CONFIG_PARAVIRT
429 /*
430 * The kernel can't run on a non-flat stack if paravirt mode
431 * is active. Rather than try to fixup the high bits of
432 * ESP, bypass this code entirely. This may break DOSemu
433 * and/or Wine support in a paravirt VM, although the option
434 * is still available to implement the setting of the high
435 * 16-bits in the INTERRUPT_RETURN paravirt-op.
436 */
437 cmpl $0, paravirt_ops+PARAVIRT_enabled
438 jne restore_nocheck
439#endif
440
441 /* If returning to userspace with 16bit stack,
442 * try to fix the higher word of ESP, as the CPU
443 * won't restore it.
444 * This is an "official" bug of all the x86-compatible
445 * CPUs, which we can try to work around to make
446 * dosemu and wine happy. */
447 movl PT_OLDESP(%esp), %eax
448 movl %esp, %edx
449 call patch_espfix_desc
450 pushl $__ESPFIX_SS
451 CFI_ADJUST_CFA_OFFSET 4
452 pushl %eax
453 CFI_ADJUST_CFA_OFFSET 4
454 DISABLE_INTERRUPTS(CLBR_EAX)
455 TRACE_IRQS_OFF
456 lss (%esp), %esp
457 CFI_ADJUST_CFA_OFFSET -8
458 jmp restore_nocheck
459 CFI_ENDPROC
460ENDPROC(system_call)
461
462 # perform work that needs to be done immediately before resumption
463 ALIGN
464 RING0_PTREGS_FRAME # can't unwind into user space anyway
465work_pending:
466 testb $_TIF_NEED_RESCHED, %cl
467 jz work_notifysig
468work_resched:
469 call schedule
470 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
471 # setting need_resched or sigpending
472 # between sampling and the iret
473 TRACE_IRQS_OFF
474 movl TI_flags(%ebp), %ecx
475 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
476 # than syscall tracing?
477 jz restore_all
478 testb $_TIF_NEED_RESCHED, %cl
479 jnz work_resched
480
481work_notifysig: # deal with pending signals and
482 # notify-resume requests
483#ifdef CONFIG_VM86
484 testl $VM_MASK, PT_EFLAGS(%esp)
485 movl %esp, %eax
486 jne work_notifysig_v86 # returning to kernel-space or
487 # vm86-space
488 xorl %edx, %edx
489 call do_notify_resume
490 jmp resume_userspace_sig
491
492 ALIGN
493work_notifysig_v86:
494 pushl %ecx # save ti_flags for do_notify_resume
495 CFI_ADJUST_CFA_OFFSET 4
496 call save_v86_state # %eax contains pt_regs pointer
497 popl %ecx
498 CFI_ADJUST_CFA_OFFSET -4
499 movl %eax, %esp
500#else
501 movl %esp, %eax
502#endif
503 xorl %edx, %edx
504 call do_notify_resume
505 jmp resume_userspace_sig
506END(work_pending)
507
508 # perform syscall exit tracing
509 ALIGN
510syscall_trace_entry:
511 movl $-ENOSYS,PT_EAX(%esp)
512 movl %esp, %eax
513 xorl %edx,%edx
514 call do_syscall_trace
515 cmpl $0, %eax
516 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
517 # so must skip actual syscall
518 movl PT_ORIG_EAX(%esp), %eax
519 cmpl $(nr_syscalls), %eax
520 jnae syscall_call
521 jmp syscall_exit
522END(syscall_trace_entry)
523
524 # perform syscall exit tracing
525 ALIGN
526syscall_exit_work:
527 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
528 jz work_pending
529 TRACE_IRQS_ON
530 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
531 # schedule() instead
532 movl %esp, %eax
533 movl $1, %edx
534 call do_syscall_trace
535 jmp resume_userspace
536END(syscall_exit_work)
537 CFI_ENDPROC
538
539 RING0_INT_FRAME # can't unwind into user space anyway
540syscall_fault:
541 pushl %eax # save orig_eax
542 CFI_ADJUST_CFA_OFFSET 4
543 SAVE_ALL
544 GET_THREAD_INFO(%ebp)
545 movl $-EFAULT,PT_EAX(%esp)
546 jmp resume_userspace
547END(syscall_fault)
548
549syscall_badsys:
550 movl $-ENOSYS,PT_EAX(%esp)
551 jmp resume_userspace
552END(syscall_badsys)
553 CFI_ENDPROC
554
555#define FIXUP_ESPFIX_STACK \
556	/* since we are on the wrong stack, we cannot do this in C :( */ \
557 PER_CPU(gdt_page, %ebx); \
558 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
559 addl %esp, %eax; \
560 pushl $__KERNEL_DS; \
561 CFI_ADJUST_CFA_OFFSET 4; \
562 pushl %eax; \
563 CFI_ADJUST_CFA_OFFSET 4; \
564 lss (%esp), %esp; \
565 CFI_ADJUST_CFA_OFFSET -8;
566#define UNWIND_ESPFIX_STACK \
567 movl %ss, %eax; \
568 /* see if on espfix stack */ \
569 cmpw $__ESPFIX_SS, %ax; \
570 jne 27f; \
571 movl $__KERNEL_DS, %eax; \
572 movl %eax, %ds; \
573 movl %eax, %es; \
574 /* switch to normal stack */ \
575 FIXUP_ESPFIX_STACK; \
57627:;
577
578/*
579 * Build the entry stubs and pointer table with
580 * some assembler magic.
581 */
582.data
583ENTRY(interrupt)
584.text
585
586ENTRY(irq_entries_start)
587 RING0_INT_FRAME
588vector=0
589.rept NR_IRQS
590 ALIGN
591 .if vector
592 CFI_ADJUST_CFA_OFFSET -4
593 .endif
5941: pushl $~(vector)
595 CFI_ADJUST_CFA_OFFSET 4
596 jmp common_interrupt
597 .previous
598 .long 1b
599 .text
600vector=vector+1
601.endr
602END(irq_entries_start)
603
604.previous
605END(interrupt)
606.previous
607
608/*
609 * the CPU automatically disables interrupts when executing an IRQ vector,
610 * so IRQ-flags tracing has to follow that:
611 */
612 ALIGN
613common_interrupt:
614 SAVE_ALL
615 TRACE_IRQS_OFF
616 movl %esp,%eax
617 call do_IRQ
618 jmp ret_from_intr
619ENDPROC(common_interrupt)
620 CFI_ENDPROC
621
622#define BUILD_INTERRUPT(name, nr) \
623ENTRY(name) \
624 RING0_INT_FRAME; \
625 pushl $~(nr); \
626 CFI_ADJUST_CFA_OFFSET 4; \
627 SAVE_ALL; \
628 TRACE_IRQS_OFF \
629 movl %esp,%eax; \
630 call smp_##name; \
631 jmp ret_from_intr; \
632 CFI_ENDPROC; \
633ENDPROC(name)
634
635/* The include is where all of the SMP etc. interrupts come from */
636#include "entry_arch.h"
637
638KPROBE_ENTRY(page_fault)
639 RING0_EC_FRAME
640 pushl $do_page_fault
641 CFI_ADJUST_CFA_OFFSET 4
642 ALIGN
643error_code:
644 /* the function address is in %fs's slot on the stack */
645 pushl %es
646 CFI_ADJUST_CFA_OFFSET 4
647 /*CFI_REL_OFFSET es, 0*/
648 pushl %ds
649 CFI_ADJUST_CFA_OFFSET 4
650 /*CFI_REL_OFFSET ds, 0*/
651 pushl %eax
652 CFI_ADJUST_CFA_OFFSET 4
653 CFI_REL_OFFSET eax, 0
654 pushl %ebp
655 CFI_ADJUST_CFA_OFFSET 4
656 CFI_REL_OFFSET ebp, 0
657 pushl %edi
658 CFI_ADJUST_CFA_OFFSET 4
659 CFI_REL_OFFSET edi, 0
660 pushl %esi
661 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET esi, 0
663 pushl %edx
664 CFI_ADJUST_CFA_OFFSET 4
665 CFI_REL_OFFSET edx, 0
666 pushl %ecx
667 CFI_ADJUST_CFA_OFFSET 4
668 CFI_REL_OFFSET ecx, 0
669 pushl %ebx
670 CFI_ADJUST_CFA_OFFSET 4
671 CFI_REL_OFFSET ebx, 0
672 cld
673 pushl %fs
674 CFI_ADJUST_CFA_OFFSET 4
675 /*CFI_REL_OFFSET fs, 0*/
676 movl $(__KERNEL_PERCPU), %ecx
677 movl %ecx, %fs
678 UNWIND_ESPFIX_STACK
679 popl %ecx
680 CFI_ADJUST_CFA_OFFSET -4
681 /*CFI_REGISTER es, ecx*/
682 movl PT_FS(%esp), %edi # get the function address
683 movl PT_ORIG_EAX(%esp), %edx # get the error code
684 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
685 mov %ecx, PT_FS(%esp)
686 /*CFI_REL_OFFSET fs, ES*/
687 movl $(__USER_DS), %ecx
688 movl %ecx, %ds
689 movl %ecx, %es
690 movl %esp,%eax # pt_regs pointer
691 call *%edi
692 jmp ret_from_exception
693 CFI_ENDPROC
694KPROBE_END(page_fault)
695
696ENTRY(coprocessor_error)
697 RING0_INT_FRAME
698 pushl $0
699 CFI_ADJUST_CFA_OFFSET 4
700 pushl $do_coprocessor_error
701 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code
703 CFI_ENDPROC
704END(coprocessor_error)
705
706ENTRY(simd_coprocessor_error)
707 RING0_INT_FRAME
708 pushl $0
709 CFI_ADJUST_CFA_OFFSET 4
710 pushl $do_simd_coprocessor_error
711 CFI_ADJUST_CFA_OFFSET 4
712 jmp error_code
713 CFI_ENDPROC
714END(simd_coprocessor_error)
715
716ENTRY(device_not_available)
717 RING0_INT_FRAME
718 pushl $-1 # mark this as an int
719 CFI_ADJUST_CFA_OFFSET 4
720 SAVE_ALL
721 GET_CR0_INTO_EAX
722 testl $0x4, %eax # EM (math emulation bit)
723 jne device_not_available_emulate
724 preempt_stop(CLBR_ANY)
725 call math_state_restore
726 jmp ret_from_exception
727device_not_available_emulate:
728 pushl $0 # temporary storage for ORIG_EIP
729 CFI_ADJUST_CFA_OFFSET 4
730 call math_emulate
731 addl $4, %esp
732 CFI_ADJUST_CFA_OFFSET -4
733 jmp ret_from_exception
734 CFI_ENDPROC
735END(device_not_available)
736
737/*
738 * Debug traps and NMI can happen at the one SYSENTER instruction
739 * that sets up the real kernel stack. Check here, since we can't
740 * allow the wrong stack to be used.
741 *
742 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
743 * already pushed 3 words if it hits on the sysenter instruction:
744 * eflags, cs and eip.
745 *
746 * We just load the right stack, and push the three (known) values
747 * by hand onto the new stack - while updating the return eip past
748 * the instruction that would have done it for sysenter.
749 */
750#define FIX_STACK(offset, ok, label) \
751 cmpw $__KERNEL_CS,4(%esp); \
752 jne ok; \
753label: \
754 movl TSS_sysenter_esp0+offset(%esp),%esp; \
755 CFI_DEF_CFA esp, 0; \
756 CFI_UNDEFINED eip; \
757 pushfl; \
758 CFI_ADJUST_CFA_OFFSET 4; \
759 pushl $__KERNEL_CS; \
760 CFI_ADJUST_CFA_OFFSET 4; \
761 pushl $sysenter_past_esp; \
762 CFI_ADJUST_CFA_OFFSET 4; \
763 CFI_REL_OFFSET eip, 0
764
765KPROBE_ENTRY(debug)
766 RING0_INT_FRAME
767 cmpl $sysenter_entry,(%esp)
768 jne debug_stack_correct
769 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
770debug_stack_correct:
771 pushl $-1 # mark this as an int
772 CFI_ADJUST_CFA_OFFSET 4
773 SAVE_ALL
774 xorl %edx,%edx # error code 0
775 movl %esp,%eax # pt_regs pointer
776 call do_debug
777 jmp ret_from_exception
778 CFI_ENDPROC
779KPROBE_END(debug)
780
781/*
782 * NMI is doubly nasty. It can happen _while_ we're handling
783 * a debug fault, and the debug fault hasn't yet been able to
784 * clear up the stack. So we first check whether we got an
785 * NMI on the sysenter entry path, but after that we need to
786 * check whether we got an NMI on the debug path where the debug
787 * fault happened on the sysenter path.
788 */
789KPROBE_ENTRY(nmi)
790 RING0_INT_FRAME
791 pushl %eax
792 CFI_ADJUST_CFA_OFFSET 4
793 movl %ss, %eax
794 cmpw $__ESPFIX_SS, %ax
795 popl %eax
796 CFI_ADJUST_CFA_OFFSET -4
797 je nmi_espfix_stack
798 cmpl $sysenter_entry,(%esp)
799 je nmi_stack_fixup
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 movl %esp,%eax
803 /* Do not access memory above the end of our stack page,
804 * it might not exist.
805 */
806 andl $(THREAD_SIZE-1),%eax
807 cmpl $(THREAD_SIZE-20),%eax
808 popl %eax
809 CFI_ADJUST_CFA_OFFSET -4
810 jae nmi_stack_correct
811 cmpl $sysenter_entry,12(%esp)
812 je nmi_debug_stack_check
813nmi_stack_correct:
814 /* We have a RING0_INT_FRAME here */
815 pushl %eax
816 CFI_ADJUST_CFA_OFFSET 4
817 SAVE_ALL
818 xorl %edx,%edx # zero error code
819 movl %esp,%eax # pt_regs pointer
820 call do_nmi
821 jmp restore_nocheck_notrace
822 CFI_ENDPROC
823
824nmi_stack_fixup:
825 RING0_INT_FRAME
826 FIX_STACK(12,nmi_stack_correct, 1)
827 jmp nmi_stack_correct
828
829nmi_debug_stack_check:
830 /* We have a RING0_INT_FRAME here */
831 cmpw $__KERNEL_CS,16(%esp)
832 jne nmi_stack_correct
833 cmpl $debug,(%esp)
834 jb nmi_stack_correct
835 cmpl $debug_esp_fix_insn,(%esp)
836 ja nmi_stack_correct
837 FIX_STACK(24,nmi_stack_correct, 1)
838 jmp nmi_stack_correct
839
840nmi_espfix_stack:
841 /* We have a RING0_INT_FRAME here.
842 *
843 * create the pointer to lss back
844 */
845 pushl %ss
846 CFI_ADJUST_CFA_OFFSET 4
847 pushl %esp
848 CFI_ADJUST_CFA_OFFSET 4
849 addw $4, (%esp)
850 /* copy the iret frame of 12 bytes */
851 .rept 3
852 pushl 16(%esp)
853 CFI_ADJUST_CFA_OFFSET 4
854 .endr
855 pushl %eax
856 CFI_ADJUST_CFA_OFFSET 4
857 SAVE_ALL
858 FIXUP_ESPFIX_STACK # %eax == %esp
859 xorl %edx,%edx # zero error code
860 call do_nmi
861 RESTORE_REGS
862 lss 12+4(%esp), %esp # back to espfix stack
863 CFI_ADJUST_CFA_OFFSET -24
8641: INTERRUPT_RETURN
865 CFI_ENDPROC
866.section __ex_table,"a"
867 .align 4
868 .long 1b,iret_exc
869.previous
870KPROBE_END(nmi)
871
872#ifdef CONFIG_PARAVIRT
873ENTRY(native_iret)
8741: iret
875.section __ex_table,"a"
876 .align 4
877 .long 1b,iret_exc
878.previous
879END(native_iret)
880
881ENTRY(native_irq_enable_sysexit)
882 sti
883 sysexit
884END(native_irq_enable_sysexit)
885#endif
886
887KPROBE_ENTRY(int3)
888 RING0_INT_FRAME
889 pushl $-1 # mark this as an int
890 CFI_ADJUST_CFA_OFFSET 4
891 SAVE_ALL
892 xorl %edx,%edx # zero error code
893 movl %esp,%eax # pt_regs pointer
894 call do_int3
895 jmp ret_from_exception
896 CFI_ENDPROC
897KPROBE_END(int3)
898
899ENTRY(overflow)
900 RING0_INT_FRAME
901 pushl $0
902 CFI_ADJUST_CFA_OFFSET 4
903 pushl $do_overflow
904 CFI_ADJUST_CFA_OFFSET 4
905 jmp error_code
906 CFI_ENDPROC
907END(overflow)
908
909ENTRY(bounds)
910 RING0_INT_FRAME
911 pushl $0
912 CFI_ADJUST_CFA_OFFSET 4
913 pushl $do_bounds
914 CFI_ADJUST_CFA_OFFSET 4
915 jmp error_code
916 CFI_ENDPROC
917END(bounds)
918
919ENTRY(invalid_op)
920 RING0_INT_FRAME
921 pushl $0
922 CFI_ADJUST_CFA_OFFSET 4
923 pushl $do_invalid_op
924 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code
926 CFI_ENDPROC
927END(invalid_op)
928
929ENTRY(coprocessor_segment_overrun)
930 RING0_INT_FRAME
931 pushl $0
932 CFI_ADJUST_CFA_OFFSET 4
933 pushl $do_coprocessor_segment_overrun
934 CFI_ADJUST_CFA_OFFSET 4
935 jmp error_code
936 CFI_ENDPROC
937END(coprocessor_segment_overrun)
938
939ENTRY(invalid_TSS)
940 RING0_EC_FRAME
941 pushl $do_invalid_TSS
942 CFI_ADJUST_CFA_OFFSET 4
943 jmp error_code
944 CFI_ENDPROC
945END(invalid_TSS)
946
947ENTRY(segment_not_present)
948 RING0_EC_FRAME
949 pushl $do_segment_not_present
950 CFI_ADJUST_CFA_OFFSET 4
951 jmp error_code
952 CFI_ENDPROC
953END(segment_not_present)
954
955ENTRY(stack_segment)
956 RING0_EC_FRAME
957 pushl $do_stack_segment
958 CFI_ADJUST_CFA_OFFSET 4
959 jmp error_code
960 CFI_ENDPROC
961END(stack_segment)
962
963KPROBE_ENTRY(general_protection)
964 RING0_EC_FRAME
965 pushl $do_general_protection
966 CFI_ADJUST_CFA_OFFSET 4
967 jmp error_code
968 CFI_ENDPROC
969KPROBE_END(general_protection)
970
971ENTRY(alignment_check)
972 RING0_EC_FRAME
973 pushl $do_alignment_check
974 CFI_ADJUST_CFA_OFFSET 4
975 jmp error_code
976 CFI_ENDPROC
977END(alignment_check)
978
979ENTRY(divide_error)
980 RING0_INT_FRAME
981 pushl $0 # no error code
982 CFI_ADJUST_CFA_OFFSET 4
983 pushl $do_divide_error
984 CFI_ADJUST_CFA_OFFSET 4
985 jmp error_code
986 CFI_ENDPROC
987END(divide_error)
988
989#ifdef CONFIG_X86_MCE
990ENTRY(machine_check)
991 RING0_INT_FRAME
992 pushl $0
993 CFI_ADJUST_CFA_OFFSET 4
994 pushl machine_check_vector
995 CFI_ADJUST_CFA_OFFSET 4
996 jmp error_code
997 CFI_ENDPROC
998END(machine_check)
999#endif
1000
1001ENTRY(spurious_interrupt_bug)
1002 RING0_INT_FRAME
1003 pushl $0
1004 CFI_ADJUST_CFA_OFFSET 4
1005 pushl $do_spurious_interrupt_bug
1006 CFI_ADJUST_CFA_OFFSET 4
1007 jmp error_code
1008 CFI_ENDPROC
1009END(spurious_interrupt_bug)
1010
1011ENTRY(kernel_thread_helper)
1012 pushl $0 # fake return address for unwinder
1013 CFI_STARTPROC
1014 movl %edx,%eax
1015 push %edx
1016 CFI_ADJUST_CFA_OFFSET 4
1017 call *%ebx
1018 push %eax
1019 CFI_ADJUST_CFA_OFFSET 4
1020 call do_exit
1021 CFI_ENDPROC
1022ENDPROC(kernel_thread_helper)
1023
1024#ifdef CONFIG_XEN
1025ENTRY(xen_hypervisor_callback)
1026 CFI_STARTPROC
1027 pushl $0
1028 CFI_ADJUST_CFA_OFFSET 4
1029 SAVE_ALL
1030 TRACE_IRQS_OFF
1031
1032 /* Check to see if we got the event in the critical
1033 region in xen_iret_direct, after we've reenabled
1034 events and checked for pending events. This simulates
1035 iret instruction's behaviour where it delivers a
1036 pending interrupt when enabling interrupts. */
1037 movl PT_EIP(%esp),%eax
1038 cmpl $xen_iret_start_crit,%eax
1039 jb 1f
1040 cmpl $xen_iret_end_crit,%eax
1041 jae 1f
1042
1043 call xen_iret_crit_fixup
1044
10451: mov %esp, %eax
1046 call xen_evtchn_do_upcall
1047 jmp ret_from_intr
1048 CFI_ENDPROC
1049ENDPROC(xen_hypervisor_callback)
1050
1051# Hypervisor uses this for application faults while it executes.
1052# We get here for two reasons:
1053# 1. Fault while reloading DS, ES, FS or GS
1054# 2. Fault while executing IRET
1055# Category 1 we fix up by reattempting the load, and zeroing the segment
1056# register if the load fails.
1057# Category 2 we fix up by jumping to do_iret_error. We cannot use the
1058# normal Linux return path in this case because if we use the IRET hypercall
1059# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1060# We distinguish between categories by maintaining a status value in EAX.
1061ENTRY(xen_failsafe_callback)
1062 CFI_STARTPROC
1063 pushl %eax
1064 CFI_ADJUST_CFA_OFFSET 4
1065 movl $1,%eax
10661: mov 4(%esp),%ds
10672: mov 8(%esp),%es
10683: mov 12(%esp),%fs
10694: mov 16(%esp),%gs
1070 testl %eax,%eax
1071 popl %eax
1072 CFI_ADJUST_CFA_OFFSET -4
1073 lea 16(%esp),%esp
1074 CFI_ADJUST_CFA_OFFSET -16
1075 jz 5f
1076 addl $16,%esp
1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
10785: pushl $0 # EAX == 0 => Category 1 (Bad segment)
1079 CFI_ADJUST_CFA_OFFSET 4
1080 SAVE_ALL
1081 jmp ret_from_exception
1082 CFI_ENDPROC
1083
1084.section .fixup,"ax"
10856: xorl %eax,%eax
1086 movl %eax,4(%esp)
1087 jmp 1b
10887: xorl %eax,%eax
1089 movl %eax,8(%esp)
1090 jmp 2b
10918: xorl %eax,%eax
1092 movl %eax,12(%esp)
1093 jmp 3b
10949: xorl %eax,%eax
1095 movl %eax,16(%esp)
1096 jmp 4b
1097.previous
1098.section __ex_table,"a"
1099 .align 4
1100 .long 1b,6b
1101 .long 2b,7b
1102 .long 3b,8b
1103 .long 4b,9b
1104.previous
1105ENDPROC(xen_failsafe_callback)
1106
1107#endif /* CONFIG_XEN */
1108
1109.section .rodata,"a"
1110#include "syscall_table_32.S"
1111
1112syscall_table_size=(.-sys_call_table)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
new file mode 100644
index 000000000000..1d232e5f5658
--- /dev/null
+++ b/arch/x86/kernel/entry_64.S
@@ -0,0 +1,1172 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 */
8
9/*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call.
14 *
15 * Normal syscalls and interrupts don't save a full stack frame; this is
16 * only done for syscall tracing, signals or fork/exec et al.
17 *
18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all registers saved.
23 *
24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers are
30 * not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
38 */
39
40#include <linux/linkage.h>
41#include <asm/segment.h>
42#include <asm/cache.h>
43#include <asm/errno.h>
44#include <asm/dwarf2.h>
45#include <asm/calling.h>
46#include <asm/asm-offsets.h>
47#include <asm/msr.h>
48#include <asm/unistd.h>
49#include <asm/thread_info.h>
50#include <asm/hw_irq.h>
51#include <asm/page.h>
52#include <asm/irqflags.h>
53
54 .code64
55
56#ifndef CONFIG_PREEMPT
57#define retint_kernel retint_restore_args
58#endif
59
60
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS
63 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
64 jnc 1f
65 TRACE_IRQS_ON
661:
67#endif
68.endm
69
70/*
71 * C code is not supposed to know about undefined top of stack. Every time
72 * a C function with a pt_regs argument is called from the SYSCALL based
73 * fast path FIXUP_TOP_OF_STACK is needed.
74 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
75 * manipulation.
76 */
77
78 /* %rsp:at FRAMEEND */
79 .macro FIXUP_TOP_OF_STACK tmp
80 movq %gs:pda_oldrsp,\tmp
81 movq \tmp,RSP(%rsp)
82 movq $__USER_DS,SS(%rsp)
83 movq $__USER_CS,CS(%rsp)
84 movq $-1,RCX(%rsp)
85 movq R11(%rsp),\tmp /* get eflags */
86 movq \tmp,EFLAGS(%rsp)
87 .endm
88
89 .macro RESTORE_TOP_OF_STACK tmp,offset=0
90 movq RSP-\offset(%rsp),\tmp
91 movq \tmp,%gs:pda_oldrsp
92 movq EFLAGS-\offset(%rsp),\tmp
93 movq \tmp,R11-\offset(%rsp)
94 .endm
95
96 .macro FAKE_STACK_FRAME child_rip
97 /* push in order ss, rsp, eflags, cs, rip */
98 xorl %eax, %eax
99 pushq %rax /* ss */
100 CFI_ADJUST_CFA_OFFSET 8
101 /*CFI_REL_OFFSET ss,0*/
102 pushq %rax /* rsp */
103 CFI_ADJUST_CFA_OFFSET 8
104 CFI_REL_OFFSET rsp,0
105 pushq $(1<<9) /* eflags - interrupts on */
106 CFI_ADJUST_CFA_OFFSET 8
107 /*CFI_REL_OFFSET rflags,0*/
108 pushq $__KERNEL_CS /* cs */
109 CFI_ADJUST_CFA_OFFSET 8
110 /*CFI_REL_OFFSET cs,0*/
111 pushq \child_rip /* rip */
112 CFI_ADJUST_CFA_OFFSET 8
113 CFI_REL_OFFSET rip,0
114 pushq %rax /* orig rax */
115 CFI_ADJUST_CFA_OFFSET 8
116 .endm
117
118 .macro UNFAKE_STACK_FRAME
119 addq $8*6, %rsp
120 CFI_ADJUST_CFA_OFFSET -(6*8)
121 .endm
122
123 .macro CFI_DEFAULT_STACK start=1
124 .if \start
125 CFI_STARTPROC simple
126 CFI_SIGNAL_FRAME
127 CFI_DEF_CFA rsp,SS+8
128 .else
129 CFI_DEF_CFA_OFFSET SS+8
130 .endif
131 CFI_REL_OFFSET r15,R15
132 CFI_REL_OFFSET r14,R14
133 CFI_REL_OFFSET r13,R13
134 CFI_REL_OFFSET r12,R12
135 CFI_REL_OFFSET rbp,RBP
136 CFI_REL_OFFSET rbx,RBX
137 CFI_REL_OFFSET r11,R11
138 CFI_REL_OFFSET r10,R10
139 CFI_REL_OFFSET r9,R9
140 CFI_REL_OFFSET r8,R8
141 CFI_REL_OFFSET rax,RAX
142 CFI_REL_OFFSET rcx,RCX
143 CFI_REL_OFFSET rdx,RDX
144 CFI_REL_OFFSET rsi,RSI
145 CFI_REL_OFFSET rdi,RDI
146 CFI_REL_OFFSET rip,RIP
147 /*CFI_REL_OFFSET cs,CS*/
148 /*CFI_REL_OFFSET rflags,EFLAGS*/
149 CFI_REL_OFFSET rsp,RSP
150 /*CFI_REL_OFFSET ss,SS*/
151 .endm
152/*
153 * A newly forked process directly context switches into this.
154 */
155/* rdi: prev */
156ENTRY(ret_from_fork)
157 CFI_DEFAULT_STACK
158 push kernel_eflags(%rip)
159 CFI_ADJUST_CFA_OFFSET 4
160 popf # reset kernel eflags
161 CFI_ADJUST_CFA_OFFSET -4
162 call schedule_tail
163 GET_THREAD_INFO(%rcx)
164 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
165 jnz rff_trace
166rff_action:
167 RESTORE_REST
168 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
169 je int_ret_from_sys_call
170 testl $_TIF_IA32,threadinfo_flags(%rcx)
171 jnz int_ret_from_sys_call
172 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
173 jmp ret_from_sys_call
174rff_trace:
175 movq %rsp,%rdi
176 call syscall_trace_leave
177 GET_THREAD_INFO(%rcx)
178 jmp rff_action
179 CFI_ENDPROC
180END(ret_from_fork)
181
182/*
183 * System call entry. Up to 6 arguments in registers are supported.
184 *
185 * SYSCALL does not save anything on the stack and does not change the
186 * stack pointer.
187 */
188
189/*
190 * Register setup:
191 * rax system call number
192 * rdi arg0
193 * rcx return address for syscall/sysret, C arg3
194 * rsi arg1
195 * rdx arg2
196 * r10 arg3 (--> moved to rcx for C)
197 * r8 arg4
198 * r9 arg5
199 * r11 eflags for syscall/sysret, temporary for C
200 * r12-r15,rbp,rbx saved by C code, not touched.
201 *
202 * Interrupts are off on entry.
203 * Only called from user space.
204 *
205 * XXX if we had a free scratch register we could save the RSP into the stack frame
206 * and report it properly in ps. Unfortunately we haven't.
207 *
208 * When the user can change the frame, always force IRET, because IRET
209 * deals with non-canonical addresses better. SYSRET has trouble
210 * with them due to bugs in both AMD and Intel CPUs.
211 */
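
Purely to illustrate the register convention documented above (this is not part of the file), a 64-bit user-space caller might issue a system call like this; write(2) is syscall 1 on x86-64, and the rcx/r11 clobbers follow from the comment:

static long raw_write(int fd, const void *buf, unsigned long count)
{
	long ret;

	asm volatile("syscall"
		     : "=a" (ret)		/* rax: return value      */
		     : "a" (1L),		/* rax: __NR_write        */
		       "D" ((long)fd),		/* rdi: arg0              */
		       "S" (buf),		/* rsi: arg1              */
		       "d" (count)		/* rdx: arg2              */
		     : "rcx", "r11", "memory");	/* clobbered by SYSCALL   */
	return ret;
}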
212
213ENTRY(system_call)
214 CFI_STARTPROC simple
215 CFI_SIGNAL_FRAME
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/
219 swapgs
220 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp
222 /*
223 * No need to follow this irqs off/on section - it's straight
224 * and short:
225 */
226 sti
227 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp)
230 CFI_REL_OFFSET rip,RIP-ARGOFFSET
231 GET_THREAD_INFO(%rcx)
232 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
233 jnz tracesys
234 cmpq $__NR_syscall_max,%rax
235 ja badsys
236 movq %r10,%rcx
237 call *sys_call_table(,%rax,8) # XXX: rip relative
238 movq %rax,RAX-ARGOFFSET(%rsp)
239/*
240 * Syscall return path ending with SYSRET (fast path)
241 * Has incomplete stack frame and undefined top of stack.
242 */
243ret_from_sys_call:
244 movl $_TIF_ALLWORK_MASK,%edi
245 /* edi: flagmask */
246sysret_check:
247 GET_THREAD_INFO(%rcx)
248 cli
249 TRACE_IRQS_OFF
250 movl threadinfo_flags(%rcx),%edx
251 andl %edi,%edx
252 jnz sysret_careful
253 CFI_REMEMBER_STATE
254 /*
255 * sysretq will re-enable interrupts:
256 */
257 TRACE_IRQS_ON
258 movq RIP-ARGOFFSET(%rsp),%rcx
259 CFI_REGISTER rip,rcx
260 RESTORE_ARGS 0,-ARG_SKIP,1
261 /*CFI_REGISTER rflags,r11*/
262 movq %gs:pda_oldrsp,%rsp
263 swapgs
264 sysretq
265
266 CFI_RESTORE_STATE
267 /* Handle reschedules */
268 /* edx: work, edi: workmask */
269sysret_careful:
270 bt $TIF_NEED_RESCHED,%edx
271 jnc sysret_signal
272 TRACE_IRQS_ON
273 sti
274 pushq %rdi
275 CFI_ADJUST_CFA_OFFSET 8
276 call schedule
277 popq %rdi
278 CFI_ADJUST_CFA_OFFSET -8
279 jmp sysret_check
280
281 /* Handle a signal */
282sysret_signal:
283 TRACE_IRQS_ON
284 sti
285 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
286 jz 1f
287
288 /* Really a signal */
289 /* edx: work flags (arg3) */
290 leaq do_notify_resume(%rip),%rax
291 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
292 xorl %esi,%esi # oldset -> arg2
293 call ptregscall_common
2941: movl $_TIF_NEED_RESCHED,%edi
295 /* Use IRET because user could have changed frame. This
296 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
297 cli
298 TRACE_IRQS_OFF
299 jmp int_with_check
300
301badsys:
302 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
303 jmp ret_from_sys_call
304
305 /* Do syscall tracing */
306tracesys:
307 SAVE_REST
308 movq $-ENOSYS,RAX(%rsp)
309 FIXUP_TOP_OF_STACK %rdi
310 movq %rsp,%rdi
311 call syscall_trace_enter
312 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
313 RESTORE_REST
314 cmpq $__NR_syscall_max,%rax
315 movq $-ENOSYS,%rcx
316 cmova %rcx,%rax
317 ja 1f
318 movq %r10,%rcx /* fixup for C */
319 call *sys_call_table(,%rax,8)
3201: movq %rax,RAX-ARGOFFSET(%rsp)
321 /* Use IRET because user could have changed frame */
322
323/*
324 * Syscall return path ending with IRET.
325 * Has correct top of stack, but partial stack frame.
326 */
327 .globl int_ret_from_sys_call
328int_ret_from_sys_call:
329 cli
330 TRACE_IRQS_OFF
331 testl $3,CS-ARGOFFSET(%rsp)
332 je retint_restore_args
333 movl $_TIF_ALLWORK_MASK,%edi
334 /* edi: mask to check */
335int_with_check:
336 GET_THREAD_INFO(%rcx)
337 movl threadinfo_flags(%rcx),%edx
338 andl %edi,%edx
339 jnz int_careful
340 andl $~TS_COMPAT,threadinfo_status(%rcx)
341 jmp retint_swapgs
342
343 /* Either reschedule or signal or syscall exit tracking needed. */
344 /* First do a reschedule test. */
345 /* edx: work, edi: workmask */
346int_careful:
347 bt $TIF_NEED_RESCHED,%edx
348 jnc int_very_careful
349 TRACE_IRQS_ON
350 sti
351 pushq %rdi
352 CFI_ADJUST_CFA_OFFSET 8
353 call schedule
354 popq %rdi
355 CFI_ADJUST_CFA_OFFSET -8
356 cli
357 TRACE_IRQS_OFF
358 jmp int_with_check
359
360 /* handle signals and tracing -- both require a full stack frame */
361int_very_careful:
362 TRACE_IRQS_ON
363 sti
364 SAVE_REST
365 /* Check for syscall exit trace */
366 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
367 jz int_signal
368 pushq %rdi
369 CFI_ADJUST_CFA_OFFSET 8
370 leaq 8(%rsp),%rdi # &ptregs -> arg1
371 call syscall_trace_leave
372 popq %rdi
373 CFI_ADJUST_CFA_OFFSET -8
374 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
375 jmp int_restore_rest
376
377int_signal:
378 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
379 jz 1f
380 movq %rsp,%rdi # &ptregs -> arg1
381 xorl %esi,%esi # oldset -> arg2
382 call do_notify_resume
3831: movl $_TIF_NEED_RESCHED,%edi
384int_restore_rest:
385 RESTORE_REST
386 cli
387 TRACE_IRQS_OFF
388 jmp int_with_check
389 CFI_ENDPROC
390END(system_call)
391
392/*
 393 * Certain special system calls that need to save a complete stack frame.
394 */
395
396 .macro PTREGSCALL label,func,arg
397 .globl \label
398\label:
399 leaq \func(%rip),%rax
400 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
401 jmp ptregscall_common
402END(\label)
403 .endm
404
405 CFI_STARTPROC
406
407 PTREGSCALL stub_clone, sys_clone, %r8
408 PTREGSCALL stub_fork, sys_fork, %rdi
409 PTREGSCALL stub_vfork, sys_vfork, %rdi
410 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
411 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
412 PTREGSCALL stub_iopl, sys_iopl, %rsi
413
414ENTRY(ptregscall_common)
415 popq %r11
416 CFI_ADJUST_CFA_OFFSET -8
417 CFI_REGISTER rip, r11
418 SAVE_REST
419 movq %r11, %r15
420 CFI_REGISTER rip, r15
421 FIXUP_TOP_OF_STACK %r11
422 call *%rax
423 RESTORE_TOP_OF_STACK %r11
424 movq %r15, %r11
425 CFI_REGISTER rip, r11
426 RESTORE_REST
427 pushq %r11
428 CFI_ADJUST_CFA_OFFSET 8
429 CFI_REL_OFFSET rip, 0
430 ret
431 CFI_ENDPROC
432END(ptregscall_common)
433
434ENTRY(stub_execve)
435 CFI_STARTPROC
436 popq %r11
437 CFI_ADJUST_CFA_OFFSET -8
438 CFI_REGISTER rip, r11
439 SAVE_REST
440 FIXUP_TOP_OF_STACK %r11
441 call sys_execve
442 RESTORE_TOP_OF_STACK %r11
443 movq %rax,RAX(%rsp)
444 RESTORE_REST
445 jmp int_ret_from_sys_call
446 CFI_ENDPROC
447END(stub_execve)
448
449/*
450 * sigreturn is special because it needs to restore all registers on return.
451 * This cannot be done with SYSRET, so use the IRET return path instead.
452 */
453ENTRY(stub_rt_sigreturn)
454 CFI_STARTPROC
455 addq $8, %rsp
456 CFI_ADJUST_CFA_OFFSET -8
457 SAVE_REST
458 movq %rsp,%rdi
459 FIXUP_TOP_OF_STACK %r11
460 call sys_rt_sigreturn
461 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
462 RESTORE_REST
463 jmp int_ret_from_sys_call
464 CFI_ENDPROC
465END(stub_rt_sigreturn)
466
467/*
468 * initial frame state for interrupts and exceptions
469 */
470 .macro _frame ref
471 CFI_STARTPROC simple
472 CFI_SIGNAL_FRAME
473 CFI_DEF_CFA rsp,SS+8-\ref
474 /*CFI_REL_OFFSET ss,SS-\ref*/
475 CFI_REL_OFFSET rsp,RSP-\ref
476 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
477 /*CFI_REL_OFFSET cs,CS-\ref*/
478 CFI_REL_OFFSET rip,RIP-\ref
479 .endm
480
481/* initial frame state for interrupts (and exceptions without error code) */
482#define INTR_FRAME _frame RIP
483/* initial frame state for exceptions with error code (and interrupts with
484 vector already pushed) */
485#define XCPT_FRAME _frame ORIG_RAX
486
487/*
488 * Interrupt entry/exit.
489 *
 490 * Interrupt entry points save only callee-clobbered registers on the fast path.
491 *
492 * Entry runs with interrupts off.
493 */
494
495/* 0(%rsp): interrupt number */
496 .macro interrupt func
497 cld
498 SAVE_ARGS
499 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
500 pushq %rbp
501 CFI_ADJUST_CFA_OFFSET 8
502 CFI_REL_OFFSET rbp, 0
503 movq %rsp,%rbp
504 CFI_DEF_CFA_REGISTER rbp
505 testl $3,CS(%rdi)
506 je 1f
507 swapgs
508 /* irqcount is used to check if a CPU is already on an interrupt
509 stack or not. While this is essentially redundant with preempt_count
510 it is a little cheaper to use a separate counter in the PDA
511 (short of moving irq_enter into assembly, which would be too
512 much work) */
5131: incl %gs:pda_irqcount
514 cmoveq %gs:pda_irqstackptr,%rsp
515 push %rbp # backlink for old unwinder
516 /*
517 * We entered an interrupt context - irqs are off:
518 */
519 TRACE_IRQS_OFF
520 call \func
521 .endm
522
523ENTRY(common_interrupt)
524 XCPT_FRAME
525 interrupt do_IRQ
526 /* 0(%rsp): oldrsp-ARGOFFSET */
527ret_from_intr:
528 cli
529 TRACE_IRQS_OFF
530 decl %gs:pda_irqcount
531 leaveq
532 CFI_DEF_CFA_REGISTER rsp
533 CFI_ADJUST_CFA_OFFSET -8
534exit_intr:
535 GET_THREAD_INFO(%rcx)
536 testl $3,CS-ARGOFFSET(%rsp)
537 je retint_kernel
538
539 /* Interrupt came from user space */
540 /*
541 * Has a correct top of stack, but a partial stack frame
542 * %rcx: thread info. Interrupts off.
543 */
544retint_with_reschedule:
545 movl $_TIF_WORK_MASK,%edi
546retint_check:
547 movl threadinfo_flags(%rcx),%edx
548 andl %edi,%edx
549 CFI_REMEMBER_STATE
550 jnz retint_careful
551retint_swapgs:
552 /*
553 * The iretq could re-enable interrupts:
554 */
555 cli
556 TRACE_IRQS_IRETQ
557 swapgs
558 jmp restore_args
559
560retint_restore_args:
561 cli
562 /*
563 * The iretq could re-enable interrupts:
564 */
565 TRACE_IRQS_IRETQ
566restore_args:
567 RESTORE_ARGS 0,8,0
568iret_label:
569 iretq
570
571 .section __ex_table,"a"
572 .quad iret_label,bad_iret
573 .previous
574 .section .fixup,"ax"
575 /* force a signal here? this matches i386 behaviour */
576 /* running with kernel gs */
577bad_iret:
578 movq $11,%rdi /* SIGSEGV */
579 TRACE_IRQS_ON
580 sti
581 jmp do_exit
582 .previous
583
584 /* edi: workmask, edx: work */
585retint_careful:
586 CFI_RESTORE_STATE
587 bt $TIF_NEED_RESCHED,%edx
588 jnc retint_signal
589 TRACE_IRQS_ON
590 sti
591 pushq %rdi
592 CFI_ADJUST_CFA_OFFSET 8
593 call schedule
594 popq %rdi
595 CFI_ADJUST_CFA_OFFSET -8
596 GET_THREAD_INFO(%rcx)
597 cli
598 TRACE_IRQS_OFF
599 jmp retint_check
600
601retint_signal:
602 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
603 jz retint_swapgs
604 TRACE_IRQS_ON
605 sti
606 SAVE_REST
607 movq $-1,ORIG_RAX(%rsp)
608 xorl %esi,%esi # oldset
609 movq %rsp,%rdi # &pt_regs
610 call do_notify_resume
611 RESTORE_REST
612 cli
613 TRACE_IRQS_OFF
614 movl $_TIF_NEED_RESCHED,%edi
615 GET_THREAD_INFO(%rcx)
616 jmp retint_check
617
618#ifdef CONFIG_PREEMPT
619 /* Returning to kernel space. Check if we need preemption */
620 /* rcx: threadinfo. interrupts off. */
621ENTRY(retint_kernel)
622 cmpl $0,threadinfo_preempt_count(%rcx)
623 jnz retint_restore_args
624 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
625 jnc retint_restore_args
626 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
627 jnc retint_restore_args
628 call preempt_schedule_irq
629 jmp exit_intr
630#endif
631
632 CFI_ENDPROC
633END(common_interrupt)
634
635/*
636 * APIC interrupts.
637 */
638 .macro apicinterrupt num,func
639 INTR_FRAME
640 pushq $~(\num)
641 CFI_ADJUST_CFA_OFFSET 8
642 interrupt \func
643 jmp ret_from_intr
644 CFI_ENDPROC
645 .endm
646
647ENTRY(thermal_interrupt)
648 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
649END(thermal_interrupt)
650
651ENTRY(threshold_interrupt)
652 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
653END(threshold_interrupt)
654
655#ifdef CONFIG_SMP
656ENTRY(reschedule_interrupt)
657 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
658END(reschedule_interrupt)
659
660 .macro INVALIDATE_ENTRY num
661ENTRY(invalidate_interrupt\num)
662 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
663END(invalidate_interrupt\num)
664 .endm
665
666 INVALIDATE_ENTRY 0
667 INVALIDATE_ENTRY 1
668 INVALIDATE_ENTRY 2
669 INVALIDATE_ENTRY 3
670 INVALIDATE_ENTRY 4
671 INVALIDATE_ENTRY 5
672 INVALIDATE_ENTRY 6
673 INVALIDATE_ENTRY 7
674
675ENTRY(call_function_interrupt)
676 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
677END(call_function_interrupt)
678ENTRY(irq_move_cleanup_interrupt)
679 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
680END(irq_move_cleanup_interrupt)
681#endif
682
683ENTRY(apic_timer_interrupt)
684 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
685END(apic_timer_interrupt)
686
687ENTRY(error_interrupt)
688 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
689END(error_interrupt)
690
691ENTRY(spurious_interrupt)
692 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
693END(spurious_interrupt)
694
695/*
696 * Exception entry points.
697 */
698 .macro zeroentry sym
699 INTR_FRAME
700 pushq $0 /* push error code/oldrax */
701 CFI_ADJUST_CFA_OFFSET 8
702 pushq %rax /* push real oldrax to the rdi slot */
703 CFI_ADJUST_CFA_OFFSET 8
704 CFI_REL_OFFSET rax,0
705 leaq \sym(%rip),%rax
706 jmp error_entry
707 CFI_ENDPROC
708 .endm
709
710 .macro errorentry sym
711 XCPT_FRAME
712 pushq %rax
713 CFI_ADJUST_CFA_OFFSET 8
714 CFI_REL_OFFSET rax,0
715 leaq \sym(%rip),%rax
716 jmp error_entry
717 CFI_ENDPROC
718 .endm
719
720 /* error code is on the stack already */
 721	/* handle NMI-like exceptions that can happen everywhere */
722 .macro paranoidentry sym, ist=0, irqtrace=1
723 SAVE_ALL
724 cld
725 movl $1,%ebx
726 movl $MSR_GS_BASE,%ecx
727 rdmsr
728 testl %edx,%edx
729 js 1f
730 swapgs
731 xorl %ebx,%ebx
7321:
733 .if \ist
734 movq %gs:pda_data_offset, %rbp
735 .endif
736 movq %rsp,%rdi
737 movq ORIG_RAX(%rsp),%rsi
738 movq $-1,ORIG_RAX(%rsp)
739 .if \ist
740 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
741 .endif
742 call \sym
743 .if \ist
744 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
745 .endif
746 cli
747 .if \irqtrace
748 TRACE_IRQS_OFF
749 .endif
750 .endm
751
752 /*
753 * "Paranoid" exit path from exception stack.
754 * Paranoid because this is used by NMIs and cannot take
755 * any kernel state for granted.
 756 * We don't do kernel preemption checks here, because only the
 757 * NMI case should be common, and NMIs do not enable IRQs and
 758 * cannot get reschedule ticks.
759 *
760 * "trace" is 0 for the NMI handler only, because irq-tracing
761 * is fundamentally NMI-unsafe. (we cannot change the soft and
762 * hard flags at once, atomically)
763 */
764 .macro paranoidexit trace=1
765 /* ebx: no swapgs flag */
766paranoid_exit\trace:
767 testl %ebx,%ebx /* swapgs needed? */
768 jnz paranoid_restore\trace
769 testl $3,CS(%rsp)
770 jnz paranoid_userspace\trace
771paranoid_swapgs\trace:
772 .if \trace
773 TRACE_IRQS_IRETQ 0
774 .endif
775 swapgs
776paranoid_restore\trace:
777 RESTORE_ALL 8
778 iretq
779paranoid_userspace\trace:
780 GET_THREAD_INFO(%rcx)
781 movl threadinfo_flags(%rcx),%ebx
782 andl $_TIF_WORK_MASK,%ebx
783 jz paranoid_swapgs\trace
784 movq %rsp,%rdi /* &pt_regs */
785 call sync_regs
786 movq %rax,%rsp /* switch stack for scheduling */
787 testl $_TIF_NEED_RESCHED,%ebx
788 jnz paranoid_schedule\trace
789 movl %ebx,%edx /* arg3: thread flags */
790 .if \trace
791 TRACE_IRQS_ON
792 .endif
793 sti
794 xorl %esi,%esi /* arg2: oldset */
795 movq %rsp,%rdi /* arg1: &pt_regs */
796 call do_notify_resume
797 cli
798 .if \trace
799 TRACE_IRQS_OFF
800 .endif
801 jmp paranoid_userspace\trace
802paranoid_schedule\trace:
803 .if \trace
804 TRACE_IRQS_ON
805 .endif
806 sti
807 call schedule
808 cli
809 .if \trace
810 TRACE_IRQS_OFF
811 .endif
812 jmp paranoid_userspace\trace
813 CFI_ENDPROC
814 .endm
815
816/*
817 * Exception entry point. This expects an error code/orig_rax on the stack
818 * and the exception handler in %rax.
819 */
820KPROBE_ENTRY(error_entry)
821 _frame RDI
822 CFI_REL_OFFSET rax,0
823 /* rdi slot contains rax, oldrax contains error code */
824 cld
825 subq $14*8,%rsp
826 CFI_ADJUST_CFA_OFFSET (14*8)
827 movq %rsi,13*8(%rsp)
828 CFI_REL_OFFSET rsi,RSI
829 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
830 CFI_REGISTER rax,rsi
831 movq %rdx,12*8(%rsp)
832 CFI_REL_OFFSET rdx,RDX
833 movq %rcx,11*8(%rsp)
834 CFI_REL_OFFSET rcx,RCX
835 movq %rsi,10*8(%rsp) /* store rax */
836 CFI_REL_OFFSET rax,RAX
837 movq %r8, 9*8(%rsp)
838 CFI_REL_OFFSET r8,R8
839 movq %r9, 8*8(%rsp)
840 CFI_REL_OFFSET r9,R9
841 movq %r10,7*8(%rsp)
842 CFI_REL_OFFSET r10,R10
843 movq %r11,6*8(%rsp)
844 CFI_REL_OFFSET r11,R11
845 movq %rbx,5*8(%rsp)
846 CFI_REL_OFFSET rbx,RBX
847 movq %rbp,4*8(%rsp)
848 CFI_REL_OFFSET rbp,RBP
849 movq %r12,3*8(%rsp)
850 CFI_REL_OFFSET r12,R12
851 movq %r13,2*8(%rsp)
852 CFI_REL_OFFSET r13,R13
853 movq %r14,1*8(%rsp)
854 CFI_REL_OFFSET r14,R14
855 movq %r15,(%rsp)
856 CFI_REL_OFFSET r15,R15
857 xorl %ebx,%ebx
858 testl $3,CS(%rsp)
859 je error_kernelspace
860error_swapgs:
861 swapgs
862error_sti:
863 movq %rdi,RDI(%rsp)
864 CFI_REL_OFFSET rdi,RDI
865 movq %rsp,%rdi
866 movq ORIG_RAX(%rsp),%rsi /* get error code */
867 movq $-1,ORIG_RAX(%rsp)
868 call *%rax
869 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
870error_exit:
871 movl %ebx,%eax
872 RESTORE_REST
873 cli
874 TRACE_IRQS_OFF
875 GET_THREAD_INFO(%rcx)
876 testl %eax,%eax
877 jne retint_kernel
878 movl threadinfo_flags(%rcx),%edx
879 movl $_TIF_WORK_MASK,%edi
880 andl %edi,%edx
881 jnz retint_careful
882 /*
883 * The iret might restore flags:
884 */
885 TRACE_IRQS_IRETQ
886 swapgs
887 RESTORE_ARGS 0,8,0
888 jmp iret_label
889 CFI_ENDPROC
890
891error_kernelspace:
892 incl %ebx
893 /* There are two places in the kernel that can potentially fault with
894 usergs. Handle them here. The exception handlers after
895 iret run with kernel gs again, so don't set the user space flag.
 896	   B-stepping K8s sometimes report a truncated RIP for IRET
 897	   exceptions returning to compat mode. Check for these here too. */
898 leaq iret_label(%rip),%rbp
899 cmpq %rbp,RIP(%rsp)
900 je error_swapgs
901 movl %ebp,%ebp /* zero extend */
902 cmpq %rbp,RIP(%rsp)
903 je error_swapgs
904 cmpq $gs_change,RIP(%rsp)
905 je error_swapgs
906 jmp error_sti
907KPROBE_END(error_entry)
908
909 /* Reload gs selector with exception handling */
910 /* edi: new selector */
911ENTRY(load_gs_index)
912 CFI_STARTPROC
913 pushf
914 CFI_ADJUST_CFA_OFFSET 8
915 cli
916 swapgs
917gs_change:
918 movl %edi,%gs
9192: mfence /* workaround */
920 swapgs
921 popf
922 CFI_ADJUST_CFA_OFFSET -8
923 ret
924 CFI_ENDPROC
925ENDPROC(load_gs_index)
926
927 .section __ex_table,"a"
928 .align 8
929 .quad gs_change,bad_gs
930 .previous
931 .section .fixup,"ax"
932 /* running with kernelgs */
933bad_gs:
934 swapgs /* switch back to user gs */
935 xorl %eax,%eax
936 movl %eax,%gs
937 jmp 2b
938 .previous
939
940/*
941 * Create a kernel thread.
942 *
943 * C extern interface:
944 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
945 *
946 * asm input arguments:
947 * rdi: fn, rsi: arg, rdx: flags
948 */
949ENTRY(kernel_thread)
950 CFI_STARTPROC
951 FAKE_STACK_FRAME $child_rip
952 SAVE_ALL
953
954 # rdi: flags, rsi: usp, rdx: will be &pt_regs
955 movq %rdx,%rdi
956 orq kernel_thread_flags(%rip),%rdi
957 movq $-1, %rsi
958 movq %rsp, %rdx
959
960 xorl %r8d,%r8d
961 xorl %r9d,%r9d
962
963 # clone now
964 call do_fork
965 movq %rax,RAX(%rsp)
966 xorl %edi,%edi
967
968 /*
 969 * It isn't worth checking for a reschedule here,
 970 * so internally to the x86_64 port you can rely on kernel_thread()
 971 * not rescheduling the child before returning; this avoids the need
 972 * for hacks, for example to fork off the per-CPU idle tasks.
973 * [Hopefully no generic code relies on the reschedule -AK]
974 */
975 RESTORE_ALL
976 UNFAKE_STACK_FRAME
977 ret
978 CFI_ENDPROC
979ENDPROC(kernel_thread)
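A usage sketch of the kernel_thread() C interface documented above (illustrative only; my_worker and spawn_worker are hypothetical names, not part of this tree).

/* Hypothetical thread function; the return value becomes the exit code. */
static int my_worker(void *arg)
{
	/* ... do work ... */
	return 0;
}

/* Hypothetical caller. */
static void spawn_worker(void)
{
	/* CLONE_FS | CLONE_FILES: share fs and file tables with the caller. */
	long pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);

	if (pid < 0)
		printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
}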
980
981child_rip:
982 pushq $0 # fake return address
983 CFI_STARTPROC
984 /*
985 * Here we are in the child and the registers are set as they were
986 * at kernel_thread() invocation in the parent.
987 */
988 movq %rdi, %rax
989 movq %rsi, %rdi
990 call *%rax
991 # exit
992 xorl %edi, %edi
993 call do_exit
994 CFI_ENDPROC
995ENDPROC(child_rip)
996
997/*
998 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
999 *
1000 * C extern interface:
1001 * extern long execve(char *name, char **argv, char **envp)
1002 *
1003 * asm input arguments:
1004 * rdi: name, rsi: argv, rdx: envp
1005 *
 1006 * We want to fall back into:
1007 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
1008 *
1009 * do_sys_execve asm fallback arguments:
1010 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1011 */
1012ENTRY(kernel_execve)
1013 CFI_STARTPROC
1014 FAKE_STACK_FRAME $0
1015 SAVE_ALL
1016 call sys_execve
1017 movq %rax, RAX(%rsp)
1018 RESTORE_REST
1019 testq %rax,%rax
1020 je int_ret_from_sys_call
1021 RESTORE_ARGS
1022 UNFAKE_STACK_FRAME
1023 ret
1024 CFI_ENDPROC
1025ENDPROC(kernel_execve)
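A hedged sketch of how the kernel_execve() fallback described above is typically invoked from kernel context (illustrative only; run_init_sketch is a hypothetical name).

/* Hypothetical caller. */
static int run_init_sketch(void)
{
	char *argv[] = { "/sbin/init", NULL };
	char *envp[] = { "HOME=/", "TERM=linux", NULL };

	/*
	 * On success control continues in the new program via
	 * int_ret_from_sys_call and never comes back here; on failure
	 * a negative errno is returned.
	 */
	return kernel_execve("/sbin/init", argv, envp);
}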
1026
1027KPROBE_ENTRY(page_fault)
1028 errorentry do_page_fault
1029KPROBE_END(page_fault)
1030
1031ENTRY(coprocessor_error)
1032 zeroentry do_coprocessor_error
1033END(coprocessor_error)
1034
1035ENTRY(simd_coprocessor_error)
1036 zeroentry do_simd_coprocessor_error
1037END(simd_coprocessor_error)
1038
1039ENTRY(device_not_available)
1040 zeroentry math_state_restore
1041END(device_not_available)
1042
1043 /* runs on exception stack */
1044KPROBE_ENTRY(debug)
1045 INTR_FRAME
1046 pushq $0
1047 CFI_ADJUST_CFA_OFFSET 8
1048 paranoidentry do_debug, DEBUG_STACK
1049 paranoidexit
1050KPROBE_END(debug)
1051
1052 /* runs on exception stack */
1053KPROBE_ENTRY(nmi)
1054 INTR_FRAME
1055 pushq $-1
1056 CFI_ADJUST_CFA_OFFSET 8
1057 paranoidentry do_nmi, 0, 0
1058#ifdef CONFIG_TRACE_IRQFLAGS
1059 paranoidexit 0
1060#else
1061 jmp paranoid_exit1
1062 CFI_ENDPROC
1063#endif
1064KPROBE_END(nmi)
1065
1066KPROBE_ENTRY(int3)
1067 INTR_FRAME
1068 pushq $0
1069 CFI_ADJUST_CFA_OFFSET 8
1070 paranoidentry do_int3, DEBUG_STACK
1071 jmp paranoid_exit1
1072 CFI_ENDPROC
1073KPROBE_END(int3)
1074
1075ENTRY(overflow)
1076 zeroentry do_overflow
1077END(overflow)
1078
1079ENTRY(bounds)
1080 zeroentry do_bounds
1081END(bounds)
1082
1083ENTRY(invalid_op)
1084 zeroentry do_invalid_op
1085END(invalid_op)
1086
1087ENTRY(coprocessor_segment_overrun)
1088 zeroentry do_coprocessor_segment_overrun
1089END(coprocessor_segment_overrun)
1090
1091ENTRY(reserved)
1092 zeroentry do_reserved
1093END(reserved)
1094
1095 /* runs on exception stack */
1096ENTRY(double_fault)
1097 XCPT_FRAME
1098 paranoidentry do_double_fault
1099 jmp paranoid_exit1
1100 CFI_ENDPROC
1101END(double_fault)
1102
1103ENTRY(invalid_TSS)
1104 errorentry do_invalid_TSS
1105END(invalid_TSS)
1106
1107ENTRY(segment_not_present)
1108 errorentry do_segment_not_present
1109END(segment_not_present)
1110
1111 /* runs on exception stack */
1112ENTRY(stack_segment)
1113 XCPT_FRAME
1114 paranoidentry do_stack_segment
1115 jmp paranoid_exit1
1116 CFI_ENDPROC
1117END(stack_segment)
1118
1119KPROBE_ENTRY(general_protection)
1120 errorentry do_general_protection
1121KPROBE_END(general_protection)
1122
1123ENTRY(alignment_check)
1124 errorentry do_alignment_check
1125END(alignment_check)
1126
1127ENTRY(divide_error)
1128 zeroentry do_divide_error
1129END(divide_error)
1130
1131ENTRY(spurious_interrupt_bug)
1132 zeroentry do_spurious_interrupt_bug
1133END(spurious_interrupt_bug)
1134
1135#ifdef CONFIG_X86_MCE
1136 /* runs on exception stack */
1137ENTRY(machine_check)
1138 INTR_FRAME
1139 pushq $0
1140 CFI_ADJUST_CFA_OFFSET 8
1141 paranoidentry do_machine_check
1142 jmp paranoid_exit1
1143 CFI_ENDPROC
1144END(machine_check)
1145#endif
1146
1147/* Call softirq on interrupt stack. Interrupts are off. */
1148ENTRY(call_softirq)
1149 CFI_STARTPROC
1150 push %rbp
1151 CFI_ADJUST_CFA_OFFSET 8
1152 CFI_REL_OFFSET rbp,0
1153 mov %rsp,%rbp
1154 CFI_DEF_CFA_REGISTER rbp
1155 incl %gs:pda_irqcount
1156 cmove %gs:pda_irqstackptr,%rsp
1157 push %rbp # backlink for old unwinder
1158 call __do_softirq
1159 leaveq
1160 CFI_DEF_CFA_REGISTER rsp
1161 CFI_ADJUST_CFA_OFFSET -8
1162 decl %gs:pda_irqcount
1163 ret
1164 CFI_ENDPROC
1165ENDPROC(call_softirq)
1166
1167KPROBE_ENTRY(ignore_sysret)
1168 CFI_STARTPROC
1169 mov $-ENOSYS,%eax
1170 sysret
1171 CFI_ENDPROC
1172ENDPROC(ignore_sysret)
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
new file mode 100644
index 000000000000..47496a40e84f
--- /dev/null
+++ b/arch/x86/kernel/genapic_64.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/threads.h>
12#include <linux/cpumask.h>
13#include <linux/string.h>
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18
19#include <asm/smp.h>
20#include <asm/ipi.h>
21#include <asm/genapic.h>
22
23#ifdef CONFIG_ACPI
24#include <acpi/acpi_bus.h>
25#endif
26
27/* which logical CPU number maps to which CPU (physical APIC ID) */
28u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
29 = { [0 ... NR_CPUS-1] = BAD_APICID };
30EXPORT_SYMBOL(x86_cpu_to_apicid);
31
32u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
33
34struct genapic __read_mostly *genapic = &apic_flat;
35
36/*
37 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
38 */
39void __init setup_apic_routing(void)
40{
41#ifdef CONFIG_ACPI
42 /*
43 * Quirk: some x86_64 machines can only use physical APIC mode
44 * regardless of how many processors are present (x86_64 ES7000
45 * is an example).
46 */
47 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
48 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
49 genapic = &apic_physflat;
50 else
51#endif
52
53 if (cpus_weight(cpu_possible_map) <= 8)
54 genapic = &apic_flat;
55 else
56 genapic = &apic_physflat;
57
58 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
59}
60
61/* Same for both flat and physical. */
62
63void send_IPI_self(int vector)
64{
65 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
66}
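How callers reach the selected sub-arch once setup_apic_routing() has run: a small sketch (illustrative only; example_resched_one is a hypothetical helper) dispatching through the genapic ops pointer declared above.

/* Hypothetical helper: send a reschedule IPI to one CPU. */
static void example_resched_one(int cpu)
{
	/* Everything goes through the ops struct picked at boot time. */
	genapic->send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}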
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
new file mode 100644
index 000000000000..ecb01eefdd27
--- /dev/null
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -0,0 +1,194 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/errno.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20#include <asm/genapic.h>
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27static cpumask_t flat_vector_allocation_domain(int cpu)
28{
29 /* Careful. Some cpus do not strictly honor the set of cpus
30 * specified in the interrupt destination when using lowest
31 * priority interrupt delivery mode.
32 *
 33	 * In particular, a hyperthreading CPU was observed to
 34	 * deliver interrupts to the wrong hyperthread when only one
 35	 * hyperthread was specified in the interrupt destination.
36 */
37 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
38 return domain;
39}
40
41/*
42 * Set up the logical destination ID.
43 *
44 * Intel recommends to set DFR, LDR and TPR before enabling
45 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
46 * document number 292116). So here it goes...
47 */
48static void flat_init_apic_ldr(void)
49{
50 unsigned long val;
51 unsigned long num, id;
52
53 num = smp_processor_id();
54 id = 1UL << num;
55 x86_cpu_to_log_apicid[num] = id;
56 apic_write(APIC_DFR, APIC_DFR_FLAT);
57 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
58 val |= SET_APIC_LOGICAL_ID(id);
59 apic_write(APIC_LDR, val);
60}
61
62static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
63{
64 unsigned long mask = cpus_addr(cpumask)[0];
65 unsigned long flags;
66
67 local_irq_save(flags);
68 __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
69 local_irq_restore(flags);
70}
71
72static void flat_send_IPI_allbutself(int vector)
73{
74#ifdef CONFIG_HOTPLUG_CPU
75 int hotplug = 1;
76#else
77 int hotplug = 0;
78#endif
79 if (hotplug || vector == NMI_VECTOR) {
80 cpumask_t allbutme = cpu_online_map;
81
82 cpu_clear(smp_processor_id(), allbutme);
83
84 if (!cpus_empty(allbutme))
85 flat_send_IPI_mask(allbutme, vector);
86 } else if (num_online_cpus() > 1) {
87 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
88 }
89}
90
91static void flat_send_IPI_all(int vector)
92{
93 if (vector == NMI_VECTOR)
94 flat_send_IPI_mask(cpu_online_map, vector);
95 else
96 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
97}
98
99static int flat_apic_id_registered(void)
100{
101 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
102}
103
104static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
105{
106 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
107}
108
109static unsigned int phys_pkg_id(int index_msb)
110{
111 return hard_smp_processor_id() >> index_msb;
112}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .target_cpus = flat_target_cpus,
119 .vector_allocation_domain = flat_vector_allocation_domain,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
128
129/*
 130 * Physflat mode is used when there are more than 8 CPUs on an AMD system.
131 * We cannot use logical delivery in this case because the mask
132 * overflows, so use physical mode.
133 */
134
135static cpumask_t physflat_target_cpus(void)
136{
137 return cpu_online_map;
138}
139
140static cpumask_t physflat_vector_allocation_domain(int cpu)
141{
142 cpumask_t domain = CPU_MASK_NONE;
143 cpu_set(cpu, domain);
144 return domain;
145}
146
147
148static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
149{
150 send_IPI_mask_sequence(cpumask, vector);
151}
152
153static void physflat_send_IPI_allbutself(int vector)
154{
155 cpumask_t allbutme = cpu_online_map;
156
157 cpu_clear(smp_processor_id(), allbutme);
158 physflat_send_IPI_mask(allbutme, vector);
159}
160
161static void physflat_send_IPI_all(int vector)
162{
163 physflat_send_IPI_mask(cpu_online_map, vector);
164}
165
166static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
167{
168 int cpu;
169
170 /*
 171	 * We're using fixed IRQ delivery, so we can only return one phys APIC ID.
172 * May as well be the first.
173 */
174 cpu = first_cpu(cpumask);
175 if ((unsigned)cpu < NR_CPUS)
176 return x86_cpu_to_apicid[cpu];
177 else
178 return BAD_APICID;
179}
180
181struct genapic apic_physflat = {
182 .name = "physical flat",
183 .int_delivery_mode = dest_Fixed,
184 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
185 .target_cpus = physflat_target_cpus,
186 .vector_allocation_domain = physflat_vector_allocation_domain,
187 .apic_id_registered = flat_apic_id_registered,
188 .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
189 .send_IPI_all = physflat_send_IPI_all,
190 .send_IPI_allbutself = physflat_send_IPI_allbutself,
191 .send_IPI_mask = physflat_send_IPI_mask,
192 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
193 .phys_pkg_id = phys_pkg_id,
194};
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
new file mode 100644
index 000000000000..41e8aec4c61d
--- /dev/null
+++ b/arch/x86/kernel/geode_32.c
@@ -0,0 +1,155 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License
7 * as published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/ioport.h>
13#include <linux/io.h>
14#include <asm/msr.h>
15#include <asm/geode.h>
16
17static struct {
18 char *name;
19 u32 msr;
20 int size;
21 u32 base;
22} lbars[] = {
23 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
24 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
25 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
26 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
27};
28
29static void __init init_lbars(void)
30{
31 u32 lo, hi;
32 int i;
33
34 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
35 rdmsr(lbars[i].msr, lo, hi);
36 if (hi & 0x01)
37 lbars[i].base = lo & 0x0000ffff;
38
39 if (lbars[i].base == 0)
40 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
41 lbars[i].name);
42 }
43}
44
45int geode_get_dev_base(unsigned int dev)
46{
47 BUG_ON(dev >= ARRAY_SIZE(lbars));
48 return lbars[dev].base;
49}
50EXPORT_SYMBOL_GPL(geode_get_dev_base);
51
52/* === GPIO API === */
53
54void geode_gpio_set(unsigned int gpio, unsigned int reg)
55{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57
58 if (!base)
59 return;
60
61 if (gpio < 16)
62 outl(1 << gpio, base + reg);
63 else
64 outl(1 << (gpio - 16), base + 0x80 + reg);
65}
66EXPORT_SYMBOL_GPL(geode_gpio_set);
67
68void geode_gpio_clear(unsigned int gpio, unsigned int reg)
69{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71
72 if (!base)
73 return;
74
75 if (gpio < 16)
76 outl(1 << (gpio + 16), base + reg);
77 else
78 outl(1 << gpio, base + 0x80 + reg);
79}
80EXPORT_SYMBOL_GPL(geode_gpio_clear);
81
82int geode_gpio_isset(unsigned int gpio, unsigned int reg)
83{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
85
86 if (!base)
87 return 0;
88
89 if (gpio < 16)
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
91 else
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
93}
94EXPORT_SYMBOL_GPL(geode_gpio_isset);
95
96void geode_gpio_set_irq(unsigned int group, unsigned int irq)
97{
98 u32 lo, hi;
99
100 if (group > 7 || irq > 15)
101 return;
102
103 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
104
105 lo &= ~(0xF << (group * 4));
106 lo |= (irq & 0xF) << (group * 4);
107
108 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
109}
110EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
111
112void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
113{
114 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
115 u32 offset, shift, val;
116
117 if (gpio >= 24)
118 offset = GPIO_MAP_W;
119 else if (gpio >= 16)
120 offset = GPIO_MAP_Z;
121 else if (gpio >= 8)
122 offset = GPIO_MAP_Y;
123 else
124 offset = GPIO_MAP_X;
125
126 shift = (gpio % 8) * 4;
127
128 val = inl(base + offset);
129
130 /* Clear whatever was there before */
131 val &= ~(0xF << shift);
132
133 /* And set the new value */
134
135 val |= ((pair & 7) << shift);
136
137 /* Set the PME bit if this is a PME event */
138
139 if (pme)
140 val |= (1 << (shift + 3));
141
142 outl(val, base + offset);
143}
144EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
145
146static int __init geode_southbridge_init(void)
147{
148 if (!is_geode())
149 return -ENODEV;
150
151 init_lbars();
152 return 0;
153}
154
155postcore_initcall(geode_southbridge_init);
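A hedged usage sketch of the GPIO API above (illustrative only; the LED pin number is made up, and the register offset macro is assumed to come from <asm/geode.h>).

/* Hypothetical helper: drive an LED wired to Geode GPIO pin 6. */
static void example_led(int on)
{
	if (on)
		geode_gpio_set(6, GPIO_OUTPUT_VAL);	/* drive pin 6 high */
	else
		geode_gpio_clear(6, GPIO_OUTPUT_VAL);	/* drive pin 6 low  */
}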
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644
index 000000000000..6c34bdd22e26
--- /dev/null
+++ b/arch/x86/kernel/head64.c
@@ -0,0 +1,86 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 */
6
7#include <linux/init.h>
8#include <linux/linkage.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/string.h>
12#include <linux/percpu.h>
13
14#include <asm/processor.h>
15#include <asm/proto.h>
16#include <asm/smp.h>
17#include <asm/bootsetup.h>
18#include <asm/setup.h>
19#include <asm/desc.h>
20#include <asm/pgtable.h>
21#include <asm/tlbflush.h>
22#include <asm/sections.h>
23
24static void __init zap_identity_mappings(void)
25{
26 pgd_t *pgd = pgd_offset_k(0UL);
27 pgd_clear(pgd);
28 __flush_tlb();
29}
30
 31/* Don't add a printk() in here. printk() relies on the PDA, which is not
 32   initialized yet. */
33static void __init clear_bss(void)
34{
35 memset(__bss_start, 0,
36 (unsigned long) __bss_stop - (unsigned long) __bss_start);
37}
38
39#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
40#define OLD_CL_MAGIC_ADDR 0x20
41#define OLD_CL_MAGIC 0xA33F
42#define OLD_CL_OFFSET 0x22
43
44static void __init copy_bootdata(char *real_mode_data)
45{
46 unsigned long new_data;
47 char * command_line;
48
49 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
50 new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
51 if (!new_data) {
52 if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
53 return;
54 }
55 new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
56 }
57 command_line = __va(new_data);
58 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
59}
60
61void __init x86_64_start_kernel(char * real_mode_data)
62{
63 int i;
64
65 /* clear bss before set_intr_gate with early_idt_handler */
66 clear_bss();
67
68 /* Make NULL pointers segfault */
69 zap_identity_mappings();
70
71 for (i = 0; i < IDT_ENTRIES; i++)
72 set_intr_gate(i, early_idt_handler);
73 asm volatile("lidt %0" :: "m" (idt_descr));
74
75 early_printk("Kernel alive\n");
76
77 for (i = 0; i < NR_CPUS; i++)
78 cpu_pda(i) = &boot_cpu_pda[i];
79
80 pda_init(0);
81 copy_bootdata(__va(real_mode_data));
82#ifdef CONFIG_SMP
83 cpu_set(0, cpu_online_map);
84#endif
85 start_kernel();
86}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 000000000000..9150ca9b5f80
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,578 @@
1/*
2 * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Enhanced CPU detection and feature setting code by Mike Jagdis
7 * and Martin Mares, November 1997.
8 */
9
10.text
11#include <linux/threads.h>
12#include <linux/linkage.h>
13#include <asm/segment.h>
14#include <asm/page.h>
15#include <asm/pgtable.h>
16#include <asm/desc.h>
17#include <asm/cache.h>
18#include <asm/thread_info.h>
19#include <asm/asm-offsets.h>
20#include <asm/setup.h>
21
22/*
23 * References to members of the new_cpu_data structure.
24 */
25
26#define X86 new_cpu_data+CPUINFO_x86
27#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
28#define X86_MODEL new_cpu_data+CPUINFO_x86_model
29#define X86_MASK new_cpu_data+CPUINFO_x86_mask
30#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
31#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
32#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
33#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
34
35/*
36 * This is how much memory *in addition to the memory covered up to
37 * and including _end* we need mapped initially.
38 * We need:
39 * - one bit for each possible page, but only in low memory, which means
40 * 2^32/4096/8 = 128K worst case (4G/4G split.)
41 * - enough space to map all low memory, which means
42 * (2^32/4096) / 1024 pages (worst case, non PAE)
43 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
44 * - a few pages for allocator use before the kernel pagetable has
45 * been set up
46 *
47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
48 * memory, which is currently unreclaimed.
49 *
50 * This should be a multiple of a page.
51 */
52LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
53
54#if PTRS_PER_PMD > 1
55PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
56#else
57PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
58#endif
59BOOTBITMAP_SIZE = LOW_PAGES / 8
60ALLOCATOR_SLOP = 4
61
62INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
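A standalone arithmetic sketch (illustrative only, plain userspace C) plugging the non-PAE worst case from the comment above into the same formulas.

#include <stdio.h>

int main(void)
{
	unsigned long low_pages = 1UL << (32 - 12);	/* 4G of 4K pages          */
	unsigned long bitmap    = low_pages / 8;	/* boot bitmap: 128K       */
	unsigned long tables    = (low_pages / 1024 + 4) * 4096;	/* PTE pages + slop */

	/* Prints: bitmap=128K page tables=4112K */
	printf("bitmap=%luK page tables=%luK\n", bitmap >> 10, tables >> 10);
	return 0;
}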
63
64/*
65 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
66 * %esi points to the real-mode code as a 32-bit pointer.
67 * CS and DS must be 4 GB flat segments, but we don't depend on
68 * any particular GDT layout, because we load our own as soon as we
69 * can.
70 */
71.section .text.head,"ax",@progbits
72ENTRY(startup_32)
73
74/*
75 * Set segments to known values.
76 */
77 cld
78 lgdt boot_gdt_descr - __PAGE_OFFSET
79 movl $(__BOOT_DS),%eax
80 movl %eax,%ds
81 movl %eax,%es
82 movl %eax,%fs
83 movl %eax,%gs
84
85/*
86 * Clear BSS first so that there are no surprises...
87 * No need to cld as DF is already clear from cld above...
88 */
89 xorl %eax,%eax
90 movl $__bss_start - __PAGE_OFFSET,%edi
91 movl $__bss_stop - __PAGE_OFFSET,%ecx
92 subl %edi,%ecx
93 shrl $2,%ecx
94 rep ; stosl
95/*
96 * Copy bootup parameters out of the way.
97 * Note: %esi still has the pointer to the real-mode data.
 98 * With kexec as the boot loader, the parameter segment might be loaded beyond
 99 * the kernel image and might not even be addressable by the early boot page
 100 * tables (kexec-on-panic case). Hence copy out the parameters before
 101 * initializing page tables.
102 */
103 movl $(boot_params - __PAGE_OFFSET),%edi
104 movl $(PARAM_SIZE/4),%ecx
105 cld
106 rep
107 movsl
108 movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
109 andl %esi,%esi
110 jnz 2f # New command line protocol
111 cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
112 jne 1f
113 movzwl OLD_CL_OFFSET,%esi
114 addl $(OLD_CL_BASE_ADDR),%esi
1152:
116 movl $(boot_command_line - __PAGE_OFFSET),%edi
117 movl $(COMMAND_LINE_SIZE/4),%ecx
118 rep
119 movsl
1201:
121
122/*
123 * Initialize page tables. This creates a PDE and a set of page
124 * tables, which are located immediately beyond _end. The variable
125 * init_pg_tables_end is set up to point to the first "safe" location.
126 * Mappings are created both at virtual address 0 (identity mapping)
127 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
128 *
129 * Warning: don't use %esi or the stack in this code. However, %esp
130 * can be used as a GPR if you really need it...
131 */
132page_pde_offset = (__PAGE_OFFSET >> 20);
133
134 movl $(pg0 - __PAGE_OFFSET), %edi
135 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
136 movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
13710:
138 leal 0x007(%edi),%ecx /* Create PDE entry */
139 movl %ecx,(%edx) /* Store identity PDE entry */
140 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
141 addl $4,%edx
142 movl $1024, %ecx
14311:
144 stosl
145 addl $0x1000,%eax
146 loop 11b
147 /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
148 /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
149 leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
150 cmpl %ebp,%eax
151 jb 10b
152 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
153
154 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
155 jmp 3f
156/*
157 * Non-boot CPU entry point; entered from trampoline.S
158 * We can't lgdt here, because lgdt itself uses a data segment, but
159 * we know the trampoline has already loaded the boot_gdt for us.
160 *
 161 * If CPU hotplug is not supported then this code can go in the init
 162 * section, which will be freed later.
163 */
164
165#ifndef CONFIG_HOTPLUG_CPU
166.section .init.text,"ax",@progbits
167#endif
168
169 /* Do an early initialization of the fixmap area */
170 movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
171 movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
172 addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
173 movl %eax, 4092(%edx)
174
175#ifdef CONFIG_SMP
176ENTRY(startup_32_smp)
177 cld
178 movl $(__BOOT_DS),%eax
179 movl %eax,%ds
180 movl %eax,%es
181 movl %eax,%fs
182 movl %eax,%gs
183
184/*
185 * New page tables may be in 4Mbyte page mode and may
186 * be using the global pages.
187 *
188 * NOTE! If we are on a 486 we may have no cr4 at all!
189 * So we do not try to touch it unless we really have
190 * some bits in it to set. This won't work if the BSP
191 * implements cr4 but this AP does not -- very unlikely
192 * but be warned! The same applies to the pse feature
193 * if not equally supported. --macro
194 *
195 * NOTE! We have to correct for the fact that we're
196 * not yet offset PAGE_OFFSET..
197 */
198#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
199 movl cr4_bits,%edx
200 andl %edx,%edx
201 jz 6f
202 movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
203 orl %edx,%eax
204 movl %eax,%cr4
205
206 btl $5, %eax # check if PAE is enabled
207 jnc 6f
208
209 /* Check if extended functions are implemented */
210 movl $0x80000000, %eax
211 cpuid
212 cmpl $0x80000000, %eax
213 jbe 6f
214 mov $0x80000001, %eax
215 cpuid
216 /* Execute Disable bit supported? */
217 btl $20, %edx
218 jnc 6f
219
220 /* Setup EFER (Extended Feature Enable Register) */
221 movl $0xc0000080, %ecx
222 rdmsr
223
224 btsl $11, %eax
225 /* Make changes effective */
226 wrmsr
227
2286:
229 /* This is a secondary processor (AP) */
230 xorl %ebx,%ebx
231 incl %ebx
232
233#endif /* CONFIG_SMP */
2343:
235
236/*
237 * Enable paging
238 */
239 movl $swapper_pg_dir-__PAGE_OFFSET,%eax
240 movl %eax,%cr3 /* set the page table pointer.. */
241 movl %cr0,%eax
242 orl $0x80000000,%eax
243 movl %eax,%cr0 /* ..and set paging (PG) bit */
244 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
2451:
246 /* Set up the stack pointer */
247 lss stack_start,%esp
248
249/*
 250 * Initialize eflags. Some BIOSes leave bits like NT set. This would
251 * confuse the debugger if this code is traced.
252 * XXX - best to initialize before switching to protected mode.
253 */
254 pushl $0
255 popfl
256
257#ifdef CONFIG_SMP
258 andl %ebx,%ebx
259 jz 1f /* Initial CPU cleans BSS */
260 jmp checkCPUtype
2611:
262#endif /* CONFIG_SMP */
263
264/*
265 * start system 32-bit setup. We need to re-do some of the things done
266 * in 16-bit mode for the "real" operations.
267 */
268 call setup_idt
269
270checkCPUtype:
271
272 movl $-1,X86_CPUID # -1 for no CPUID initially
273
274/* check if it is 486 or 386. */
275/*
276 * XXX - this does a lot of unnecessary setup. Alignment checks don't
277 * apply at our cpl of 0 and the stack ought to be aligned already, and
278 * we don't need to preserve eflags.
279 */
280
281 movb $3,X86 # at least 386
282 pushfl # push EFLAGS
283 popl %eax # get EFLAGS
284 movl %eax,%ecx # save original EFLAGS
285 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
286 pushl %eax # copy to EFLAGS
287 popfl # set EFLAGS
288 pushfl # get new EFLAGS
289 popl %eax # put it in eax
290 xorl %ecx,%eax # change in flags
291 pushl %ecx # restore original EFLAGS
292 popfl
293 testl $0x40000,%eax # check if AC bit changed
294 je is386
295
296 movb $4,X86 # at least 486
297 testl $0x200000,%eax # check if ID bit changed
298 je is486
299
300 /* get vendor info */
301 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
302 cpuid
303 movl %eax,X86_CPUID # save CPUID level
304 movl %ebx,X86_VENDOR_ID # lo 4 chars
305 movl %edx,X86_VENDOR_ID+4 # next 4 chars
306 movl %ecx,X86_VENDOR_ID+8 # last 4 chars
307
308 orl %eax,%eax # do we have processor info as well?
309 je is486
310
311 movl $1,%eax # Use the CPUID instruction to get CPU type
312 cpuid
313 movb %al,%cl # save reg for future use
314 andb $0x0f,%ah # mask processor family
315 movb %ah,X86
316 andb $0xf0,%al # mask model
317 shrb $4,%al
318 movb %al,X86_MODEL
319 andb $0x0f,%cl # mask mask revision
320 movb %cl,X86_MASK
321 movl %edx,X86_CAPABILITY
322
323is486: movl $0x50022,%ecx # set AM, WP, NE and MP
324 jmp 2f
325
326is386: movl $2,%ecx # set MP
3272: movl %cr0,%eax
328 andl $0x80000011,%eax # Save PG,PE,ET
329 orl %ecx,%eax
330 movl %eax,%cr0
331
332 call check_x87
333 lgdt early_gdt_descr
334 lidt idt_descr
335 ljmp $(__KERNEL_CS),$1f
3361: movl $(__KERNEL_DS),%eax # reload all the segment registers
337 movl %eax,%ss # after changing gdt.
338 movl %eax,%fs # gets reset once there's real percpu
339
340 movl $(__USER_DS),%eax # DS/ES contains default USER segment
341 movl %eax,%ds
342 movl %eax,%es
343
344 xorl %eax,%eax # Clear GS and LDT
345 movl %eax,%gs
346 lldt %ax
347
348 cld # gcc2 wants the direction flag cleared at all times
349 pushl $0 # fake return address for unwinder
350#ifdef CONFIG_SMP
351 movb ready, %cl
352 movb $1, ready
353 cmpb $0,%cl # the first CPU calls start_kernel
354 je 1f
355 movl $(__KERNEL_PERCPU), %eax
356 movl %eax,%fs # set this cpu's percpu
357 jmp initialize_secondary # all other CPUs call initialize_secondary
3581:
359#endif /* CONFIG_SMP */
360 jmp start_kernel
361
362/*
363 * We depend on ET to be correct. This checks for 287/387.
364 */
365check_x87:
366 movb $0,X86_HARD_MATH
367 clts
368 fninit
369 fstsw %ax
370 cmpb $0,%al
371 je 1f
372 movl %cr0,%eax /* no coprocessor: have to set bits */
373 xorl $4,%eax /* set EM */
374 movl %eax,%cr0
375 ret
376 ALIGN
3771: movb $1,X86_HARD_MATH
378 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
379 ret
380
381/*
382 * setup_idt
383 *
 384 * sets up an IDT with 256 entries pointing to
 385 * ignore_int, interrupt gates. It doesn't actually load the
 386 * IDT - that can be done only after paging has been enabled
387 * and the kernel moved to PAGE_OFFSET. Interrupts
388 * are enabled elsewhere, when we can be relatively
389 * sure everything is ok.
390 *
391 * Warning: %esi is live across this function.
392 */
393setup_idt:
394 lea ignore_int,%edx
395 movl $(__KERNEL_CS << 16),%eax
396 movw %dx,%ax /* selector = 0x0010 = cs */
397 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
398
399 lea idt_table,%edi
400 mov $256,%ecx
401rp_sidt:
402 movl %eax,(%edi)
403 movl %edx,4(%edi)
404 addl $8,%edi
405 dec %ecx
406 jne rp_sidt
407
408.macro set_early_handler handler,trapno
409 lea \handler,%edx
410 movl $(__KERNEL_CS << 16),%eax
411 movw %dx,%ax
412 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
413 lea idt_table,%edi
414 movl %eax,8*\trapno(%edi)
415 movl %edx,8*\trapno+4(%edi)
416.endm
417
418 set_early_handler handler=early_divide_err,trapno=0
419 set_early_handler handler=early_illegal_opcode,trapno=6
420 set_early_handler handler=early_protection_fault,trapno=13
421 set_early_handler handler=early_page_fault,trapno=14
422
423 ret
424
425early_divide_err:
426 xor %edx,%edx
427 pushl $0 /* fake errcode */
428 jmp early_fault
429
430early_illegal_opcode:
431 movl $6,%edx
432 pushl $0 /* fake errcode */
433 jmp early_fault
434
435early_protection_fault:
436 movl $13,%edx
437 jmp early_fault
438
439early_page_fault:
440 movl $14,%edx
441 jmp early_fault
442
443early_fault:
444 cld
445#ifdef CONFIG_PRINTK
446 movl $(__KERNEL_DS),%eax
447 movl %eax,%ds
448 movl %eax,%es
449 cmpl $2,early_recursion_flag
450 je hlt_loop
451 incl early_recursion_flag
452 movl %cr2,%eax
453 pushl %eax
454 pushl %edx /* trapno */
455 pushl $fault_msg
456#ifdef CONFIG_EARLY_PRINTK
457 call early_printk
458#else
459 call printk
460#endif
461#endif
462hlt_loop:
463 hlt
464 jmp hlt_loop
465
466/* This is the default interrupt "handler" :-) */
467 ALIGN
468ignore_int:
469 cld
470#ifdef CONFIG_PRINTK
471 pushl %eax
472 pushl %ecx
473 pushl %edx
474 pushl %es
475 pushl %ds
476 movl $(__KERNEL_DS),%eax
477 movl %eax,%ds
478 movl %eax,%es
479 cmpl $2,early_recursion_flag
480 je hlt_loop
481 incl early_recursion_flag
482 pushl 16(%esp)
483 pushl 24(%esp)
484 pushl 32(%esp)
485 pushl 40(%esp)
486 pushl $int_msg
487#ifdef CONFIG_EARLY_PRINTK
488 call early_printk
489#else
490 call printk
491#endif
492 addl $(5*4),%esp
493 popl %ds
494 popl %es
495 popl %edx
496 popl %ecx
497 popl %eax
498#endif
499 iret
500
501.section .text
502/*
503 * Real beginning of normal "text" segment
504 */
505ENTRY(stext)
506ENTRY(_stext)
507
508/*
509 * BSS section
510 */
511.section ".bss.page_aligned","wa"
512 .align PAGE_SIZE_asm
513ENTRY(swapper_pg_dir)
514 .fill 1024,4,0
515ENTRY(swapper_pg_pmd)
516 .fill 1024,4,0
517ENTRY(empty_zero_page)
518 .fill 4096,1,0
519
520/*
521 * This starts the data section.
522 */
523.data
524ENTRY(stack_start)
525 .long init_thread_union+THREAD_SIZE
526 .long __BOOT_DS
527
528ready: .byte 0
529
530early_recursion_flag:
531 .long 0
532
533int_msg:
534 .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
535
536fault_msg:
537 .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
538 .asciz "Stack: %p %p %p %p %p %p %p %p\n"
539
540#include "../../x86/xen/xen-head.S"
541
542/*
 543 * The IDT and GDT 'descriptors' are strange 48-bit objects
 544 * used only by the lidt and lgdt instructions. They are not
 545 * like usual segment descriptors - they consist of a 16-bit
 546 * limit and a 32-bit linear address value:
547 */
548
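The same 48-bit object expressed as a C structure, as a sketch only (the struct name is hypothetical; the kernel has its own definition for this).

/* Hypothetical illustration of the lidt/lgdt operand layout on 32-bit. */
struct gdt_idt_ptr {
	unsigned short limit;		/* 16-bit table size - 1  */
	unsigned long  base;		/* 32-bit linear address  */
} __attribute__((packed));		/* 6 bytes total          */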
549.globl boot_gdt_descr
550.globl idt_descr
551
552 ALIGN
553# early boot GDT descriptor (must use 1:1 address mapping)
554 .word 0 # 32 bit align gdt_desc.address
555boot_gdt_descr:
556 .word __BOOT_DS+7
557 .long boot_gdt - __PAGE_OFFSET
558
559 .word 0 # 32-bit align idt_desc.address
560idt_descr:
561 .word IDT_ENTRIES*8-1 # idt contains 256 entries
562 .long idt_table
563
564# boot GDT descriptor (later on used by CPU#0):
565 .word 0 # 32 bit align gdt_desc.address
566ENTRY(early_gdt_descr)
567 .word GDT_ENTRIES*8-1
568 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
569
570/*
571 * The boot_gdt must mirror the equivalent in setup.S and is
572 * used only for booting.
573 */
574 .align L1_CACHE_BYTES
575ENTRY(boot_gdt)
576 .fill GDT_ENTRY_BOOT_CS,8,0
577 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
578 .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644
index 000000000000..b6167fe3330e
--- /dev/null
+++ b/arch/x86/kernel/head_64.S
@@ -0,0 +1,416 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
9 */
10
11
12#include <linux/linkage.h>
13#include <linux/threads.h>
14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/pgtable.h>
18#include <asm/page.h>
19#include <asm/msr.h>
20#include <asm/cache.h>
21
 22/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
 23 * because we need identity-mapped pages.
 24 *
 25 */
26
27 .text
28 .section .text.head
29 .code64
30 .globl startup_64
31startup_64:
32
33 /*
 34	 * At this point the CPU runs in 64-bit mode with CS.L = 1, CS.D = 0,
35 * and someone has loaded an identity mapped page table
36 * for us. These identity mapped page tables map all of the
37 * kernel pages and possibly all of memory.
38 *
39 * %esi holds a physical pointer to real_mode_data.
40 *
41 * We come here either directly from a 64bit bootloader, or from
42 * arch/x86_64/boot/compressed/head.S.
43 *
 44	 * We only come here initially at boot; nothing else comes here.
45 *
 46	 * Since we may be loaded at an address different from what we were
 47	 * compiled to run at, we first fix up the physical addresses in our page
48 * tables and then reload them.
49 */
50
51 /* Compute the delta between the address I am compiled to run at and the
52 * address I am actually running at.
53 */
54 leaq _text(%rip), %rbp
55 subq $_text - __START_KERNEL_map, %rbp
56
57 /* Is the address not 2M aligned? */
58 movq %rbp, %rax
59 andl $~LARGE_PAGE_MASK, %eax
60 testl %eax, %eax
61 jnz bad_address
62
63 /* Is the address too large? */
64 leaq _text(%rip), %rdx
65 movq $PGDIR_SIZE, %rax
66 cmpq %rax, %rdx
67 jae bad_address
68
69 /* Fixup the physical addresses in the page table
70 */
71 addq %rbp, init_level4_pgt + 0(%rip)
72 addq %rbp, init_level4_pgt + (258*8)(%rip)
73 addq %rbp, init_level4_pgt + (511*8)(%rip)
74
75 addq %rbp, level3_ident_pgt + 0(%rip)
76
77 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
78 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
79
80 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
81
82 /* Add an Identity mapping if I am above 1G */
83 leaq _text(%rip), %rdi
84 andq $LARGE_PAGE_MASK, %rdi
85
86 movq %rdi, %rax
87 shrq $PUD_SHIFT, %rax
88 andq $(PTRS_PER_PUD - 1), %rax
89 jz ident_complete
90
91 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
92 leaq level3_ident_pgt(%rip), %rbx
93 movq %rdx, 0(%rbx, %rax, 8)
94
95 movq %rdi, %rax
96 shrq $PMD_SHIFT, %rax
97 andq $(PTRS_PER_PMD - 1), %rax
98 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
99 leaq level2_spare_pgt(%rip), %rbx
100 movq %rdx, 0(%rbx, %rax, 8)
101ident_complete:
102
103 /* Fixup the kernel text+data virtual addresses
104 */
105 leaq level2_kernel_pgt(%rip), %rdi
106 leaq 4096(%rdi), %r8
107 /* See if it is a valid page table entry */
1081: testq $1, 0(%rdi)
109 jz 2f
110 addq %rbp, 0(%rdi)
111 /* Go to the next page */
1122: addq $8, %rdi
113 cmp %r8, %rdi
114 jne 1b
115
116 /* Fixup phys_base */
117 addq %rbp, phys_base(%rip)
118
119#ifdef CONFIG_SMP
120 addq %rbp, trampoline_level4_pgt + 0(%rip)
121 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
122#endif
123#ifdef CONFIG_ACPI_SLEEP
124 addq %rbp, wakeup_level4_pgt + 0(%rip)
125 addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
126#endif
127
128 /* Due to ENTRY(), sometimes the empty space gets filled with
 129	 * zeros. Better to take a jmp than to rely on the empty space being
 130	 * filled with 0x90 (nop).
131 */
132 jmp secondary_startup_64
133ENTRY(secondary_startup_64)
134 /*
 135	 * At this point the CPU runs in 64-bit mode with CS.L = 1, CS.D = 0,
136 * and someone has loaded a mapped page table.
137 *
138 * %esi holds a physical pointer to real_mode_data.
139 *
140 * We come here either from startup_64 (using physical addresses)
141 * or from trampoline.S (using virtual addresses).
142 *
143 * Using virtual addresses from trampoline.S removes the need
144 * to have any identity mapped pages in the kernel page table
145 * after the boot processor executes this code.
146 */
147
148 /* Enable PAE mode and PGE */
149 xorq %rax, %rax
150 btsq $5, %rax
151 btsq $7, %rax
152 movq %rax, %cr4
153
154 /* Setup early boot stage 4 level pagetables. */
155 movq $(init_level4_pgt - __START_KERNEL_map), %rax
156 addq phys_base(%rip), %rax
157 movq %rax, %cr3
158
159 /* Ensure I am executing from virtual addresses */
160 movq $1f, %rax
161 jmp *%rax
1621:
163
164 /* Check if nx is implemented */
165 movl $0x80000001, %eax
166 cpuid
167 movl %edx,%edi
168
169 /* Setup EFER (Extended Feature Enable Register) */
170 movl $MSR_EFER, %ecx
171 rdmsr
172 btsl $_EFER_SCE, %eax /* Enable System Call */
173 btl $20,%edi /* No Execute supported? */
174 jnc 1f
175 btsl $_EFER_NX, %eax
1761: wrmsr /* Make changes effective */
177
178 /* Setup cr0 */
179#define CR0_PM 1 /* protected mode */
180#define CR0_MP (1<<1)
181#define CR0_ET (1<<4)
182#define CR0_NE (1<<5)
183#define CR0_WP (1<<16)
184#define CR0_AM (1<<18)
185#define CR0_PAGING (1<<31)
186 movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
187 /* Make changes effective */
188 movq %rax, %cr0
189
190 /* Setup a boot time stack */
191 movq init_rsp(%rip),%rsp
192
193 /* zero EFLAGS after setting rsp */
194 pushq $0
195 popfq
196
197 /*
198 * We must switch to a new descriptor in kernel space for the GDT
 199	 * because soon the kernel won't have access anymore to the userspace
 200	 * addresses we're currently running at. We have to do that here
201 * because in 32bit we couldn't load a 64bit linear address.
202 */
203 lgdt cpu_gdt_descr(%rip)
204
205 /* set up data segments. actually 0 would do too */
206 movl $__KERNEL_DS,%eax
207 movl %eax,%ds
208 movl %eax,%ss
209 movl %eax,%es
210
211 /*
212 * We don't really need to load %fs or %gs, but load them anyway
213 * to kill any stale realmode selectors. This allows execution
214 * under VT hardware.
215 */
216 movl %eax,%fs
217 movl %eax,%gs
218
219 /*
 220	 * Set up a dummy PDA. This is just for some early bootup code
 221	 * that does in_interrupt().
222 */
223 movl $MSR_GS_BASE,%ecx
224 movq $empty_zero_page,%rax
225 movq %rax,%rdx
226 shrq $32,%rdx
227 wrmsr
228
229 /* esi is pointer to real mode structure with interesting info.
230 pass it to C */
231 movl %esi, %edi
232
 233	/* Finally jump to run C code and to be on a real kernel address.
 234	 * Since we are running on identity-mapped space we have to jump
 235	 * to the full 64bit address; this is only possible with an indirect
 236	 * jump. In addition we need to ensure %cs is set, so we make this
 237	 * a far return.
238 */
239 movq initial_code(%rip),%rax
240 pushq $0 # fake return address to stop unwinder
241 pushq $__KERNEL_CS # set correct cs
242 pushq %rax # target address in negative space
243 lretq
244
245 /* SMP bootup changes these two */
246#ifndef CONFIG_HOTPLUG_CPU
247 .pushsection .init.data
248#endif
249 .align 8
250 .globl initial_code
251initial_code:
252 .quad x86_64_start_kernel
253#ifndef CONFIG_HOTPLUG_CPU
254 .popsection
255#endif
256 .globl init_rsp
257init_rsp:
258 .quad init_thread_union+THREAD_SIZE-8
259
260bad_address:
261 jmp bad_address
262
263ENTRY(early_idt_handler)
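	/*
	 * Assumes the vector pushed an error code: (%rsp) holds the error
	 * code and 8(%rsp) the faulting RIP, which is what the early_printk
	 * arguments below rely on.
	 */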
264 cmpl $2,early_recursion_flag(%rip)
265 jz 1f
266 incl early_recursion_flag(%rip)
267 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi
272 call early_printk
273 cmpl $2,early_recursion_flag(%rip)
274 jz 1f
275 call dump_stack
276#ifdef CONFIG_KALLSYMS
277 leaq early_idt_ripmsg(%rip),%rdi
278 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol
280#endif
2811: hlt
282 jmp 1b
283early_recursion_flag:
284 .long 0
285
286early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
288early_idt_ripmsg:
289 .asciz "RIP %s\n"
290
291.balign PAGE_SIZE
292
293#define NEXT_PAGE(name) \
294 .balign PAGE_SIZE; \
295ENTRY(name)
296
297/* Automate the creation of 1 to 1 mapping pmd entries */
298#define PMDS(START, PERM, COUNT) \
299 i = 0 ; \
300 .rept (COUNT) ; \
301 .quad (START) + (i << 21) + (PERM) ; \
302 i = i + 1 ; \
303 .endr
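/*
 * For example (illustrative only), PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 3)
 * emits three .quad entries for 0x0, 0x200000 and 0x400000 OR'ed with the
 * permission bits, i.e. one 2MB PMD entry per iteration.
 */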
304
305 /*
306 * This default setting generates an ident mapping at address 0x100000
307 * and a mapping for the kernel that precisely maps virtual address
308 * 0xffffffff80000000 to physical address 0x000000. (always using
309 * 2Mbyte large pages provided by PAE mode)
310 */
311NEXT_PAGE(init_level4_pgt)
312 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
313 .fill 257,8,0
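	/*
	 * 1 + 257 entries precede this one, so it is PGD slot 258: the
	 * same ident mapping again, presumably for the kernel direct
	 * mapping region.
	 */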
314 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
315 .fill 252,8,0
316 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
317 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
318
319NEXT_PAGE(level3_ident_pgt)
320 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
321 .fill 511,8,0
322
323NEXT_PAGE(level3_kernel_pgt)
324 .fill 510,8,0
325 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
326 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
327 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
328
329NEXT_PAGE(level2_fixmap_pgt)
330 .fill 506,8,0
331 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
332 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
333 .fill 5,8,0
334
335NEXT_PAGE(level1_fixmap_pgt)
336 .fill 512,8,0
337
338NEXT_PAGE(level2_ident_pgt)
339 /* Since I easily can, map the first 1G.
340 * Don't set NX because code runs from these pages.
341 */
342 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
343
344NEXT_PAGE(level2_kernel_pgt)
345 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
 346	   When you change this, change KERNEL_TEXT_SIZE in page.h too. */
347 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
348 PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
349 /* Module mapping starts here */
350 .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
351
352NEXT_PAGE(level2_spare_pgt)
353 .fill 512,8,0
354
355#undef PMDS
356#undef NEXT_PAGE
357
358 .data
359 .align 16
360 .globl cpu_gdt_descr
361cpu_gdt_descr:
362 .word gdt_end-cpu_gdt_table-1
363gdt:
364 .quad cpu_gdt_table
365#ifdef CONFIG_SMP
366 .rept NR_CPUS-1
367 .word 0
368 .quad 0
369 .endr
370#endif
371
372ENTRY(phys_base)
373 /* This must match the first entry in level2_kernel_pgt */
374 .quad 0x0000000000000000
375
376/* We need valid kernel segments for data and code in long mode too
377 * IRET will check the segment types kkeil 2000/10/28
378 * Also sysret mandates a special GDT layout
379 */
380
381 .section .data.page_aligned, "aw"
382 .align PAGE_SIZE
383
384/* The TLS descriptors are currently at a different place compared to i386.
385 Hopefully nobody expects them at a fixed place (Wine?) */
386
387ENTRY(cpu_gdt_table)
388 .quad 0x0000000000000000 /* NULL descriptor */
389 .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
390 .quad 0x00af9b000000ffff /* __KERNEL_CS */
391 .quad 0x00cf93000000ffff /* __KERNEL_DS */
392 .quad 0x00cffb000000ffff /* __USER32_CS */
393 .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
394 .quad 0x00affb000000ffff /* __USER_CS */
395 .quad 0x0 /* unused */
396 .quad 0,0 /* TSS */
397 .quad 0,0 /* LDT */
398 .quad 0,0,0 /* three TLS descriptors */
399 .quad 0x0000f40000000000 /* node/CPU stored in limit */
400gdt_end:
401 /* asm/segment.h:GDT_ENTRIES must match this */
402 /* This should be a multiple of the cache line size */
403 /* GDTs of other CPUs are now dynamically allocated */
404
405 /* zero the remaining page */
406 .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
407
408 .section .bss, "aw", @nobits
409 .align L1_CACHE_BYTES
410ENTRY(idt_table)
411 .skip 256 * 16
412
413 .section .bss.page_aligned, "aw", @nobits
414 .align PAGE_SIZE
415ENTRY(empty_zero_page)
416 .skip PAGE_SIZE
diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c
new file mode 100644
index 000000000000..533d4932bc79
--- /dev/null
+++ b/arch/x86/kernel/hpet_32.c
@@ -0,0 +1,553 @@
1#include <linux/clocksource.h>
2#include <linux/clockchips.h>
3#include <linux/errno.h>
4#include <linux/hpet.h>
5#include <linux/init.h>
6#include <linux/sysdev.h>
7#include <linux/pm.h>
8#include <linux/delay.h>
9
10#include <asm/hpet.h>
11#include <asm/io.h>
12
13extern struct clock_event_device *global_clock_event;
14
15#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
21/*
22 * HPET address is set in acpi/boot.c, when an ACPI entry exists
23 */
24unsigned long hpet_address;
25static void __iomem * hpet_virt_address;
26
27static inline unsigned long hpet_readl(unsigned long a)
28{
29 return readl(hpet_virt_address + a);
30}
31
32static inline void hpet_writel(unsigned long d, unsigned long a)
33{
34 writel(d, hpet_virt_address + a);
35}
36
37/*
38 * HPET command line enable / disable
39 */
40static int boot_hpet_disable;
41
42static int __init hpet_setup(char* str)
43{
44 if (str) {
45 if (!strncmp("disable", str, 7))
46 boot_hpet_disable = 1;
47 }
48 return 1;
49}
50__setup("hpet=", hpet_setup);
51
52static inline int is_hpet_capable(void)
53{
54 return (!boot_hpet_disable && hpet_address);
55}
56
57/*
58 * HPET timer interrupt enable / disable
59 */
60static int hpet_legacy_int_enabled;
61
62/**
63 * is_hpet_enabled - check whether the hpet timer interrupt is enabled
64 */
65int is_hpet_enabled(void)
66{
67 return is_hpet_capable() && hpet_legacy_int_enabled;
68}
69
70/*
71 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
72 * timer 0 and timer 1 in case of RTC emulation.
73 */
74#ifdef CONFIG_HPET
75static void hpet_reserve_platform_timers(unsigned long id)
76{
77 struct hpet __iomem *hpet = hpet_virt_address;
78 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
79 unsigned int nrtimers, i;
80 struct hpet_data hd;
81
82 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
83
84 memset(&hd, 0, sizeof (hd));
85 hd.hd_phys_address = hpet_address;
86 hd.hd_address = hpet_virt_address;
87 hd.hd_nirqs = nrtimers;
88 hd.hd_flags = HPET_DATA_PLATFORM;
89 hpet_reserve_timer(&hd, 0);
90
91#ifdef CONFIG_HPET_EMULATE_RTC
92 hpet_reserve_timer(&hd, 1);
93#endif
94
95 hd.hd_irq[0] = HPET_LEGACY_8254;
96 hd.hd_irq[1] = HPET_LEGACY_RTC;
97
98 for (i = 2; i < nrtimers; timer++, i++)
99 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
100 Tn_INT_ROUTE_CNF_SHIFT;
101
102 hpet_alloc(&hd);
103
104}
105#else
106static void hpet_reserve_platform_timers(unsigned long id) { }
107#endif
108
109/*
110 * Common hpet info
111 */
112static unsigned long hpet_period;
113
114static void hpet_set_mode(enum clock_event_mode mode,
115 struct clock_event_device *evt);
116static int hpet_next_event(unsigned long delta,
117 struct clock_event_device *evt);
118
119/*
120 * The hpet clock event device
121 */
122static struct clock_event_device hpet_clockevent = {
123 .name = "hpet",
124 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
125 .set_mode = hpet_set_mode,
126 .set_next_event = hpet_next_event,
127 .shift = 32,
128 .irq = 0,
129};
130
131static void hpet_start_counter(void)
132{
133 unsigned long cfg = hpet_readl(HPET_CFG);
134
135 cfg &= ~HPET_CFG_ENABLE;
136 hpet_writel(cfg, HPET_CFG);
137 hpet_writel(0, HPET_COUNTER);
138 hpet_writel(0, HPET_COUNTER + 4);
139 cfg |= HPET_CFG_ENABLE;
140 hpet_writel(cfg, HPET_CFG);
141}
142
143static void hpet_enable_int(void)
144{
145 unsigned long cfg = hpet_readl(HPET_CFG);
146
147 cfg |= HPET_CFG_LEGACY;
148 hpet_writel(cfg, HPET_CFG);
149 hpet_legacy_int_enabled = 1;
150}
151
152static void hpet_set_mode(enum clock_event_mode mode,
153 struct clock_event_device *evt)
154{
155 unsigned long cfg, cmp, now;
156 uint64_t delta;
157
158 switch(mode) {
159 case CLOCK_EVT_MODE_PERIODIC:
160 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
161 delta >>= hpet_clockevent.shift;
162 now = hpet_readl(HPET_COUNTER);
163 cmp = now + (unsigned long) delta;
164 cfg = hpet_readl(HPET_T0_CFG);
165 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
166 HPET_TN_SETVAL | HPET_TN_32BIT;
167 hpet_writel(cfg, HPET_T0_CFG);
168 /*
169 * The first write after writing TN_SETVAL to the
170 * config register sets the counter value, the second
171 * write sets the period.
172 */
173 hpet_writel(cmp, HPET_T0_CMP);
174 udelay(1);
175 hpet_writel((unsigned long) delta, HPET_T0_CMP);
176 break;
177
178 case CLOCK_EVT_MODE_ONESHOT:
179 cfg = hpet_readl(HPET_T0_CFG);
180 cfg &= ~HPET_TN_PERIODIC;
181 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
182 hpet_writel(cfg, HPET_T0_CFG);
183 break;
184
185 case CLOCK_EVT_MODE_UNUSED:
186 case CLOCK_EVT_MODE_SHUTDOWN:
187 cfg = hpet_readl(HPET_T0_CFG);
188 cfg &= ~HPET_TN_ENABLE;
189 hpet_writel(cfg, HPET_T0_CFG);
190 break;
191
192 case CLOCK_EVT_MODE_RESUME:
193 hpet_enable_int();
194 break;
195 }
196}
197
198static int hpet_next_event(unsigned long delta,
199 struct clock_event_device *evt)
200{
201 unsigned long cnt;
202
203 cnt = hpet_readl(HPET_COUNTER);
204 cnt += delta;
205 hpet_writel(cnt, HPET_T0_CMP);
206
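	/*
	 * If the counter has already raced past the new comparator value,
	 * report -ETIME so the clockevents layer can retry.
	 */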
207 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
208}
209
210/*
211 * Clock source related code
212 */
213static cycle_t read_hpet(void)
214{
215 return (cycle_t)hpet_readl(HPET_COUNTER);
216}
217
218static struct clocksource clocksource_hpet = {
219 .name = "hpet",
220 .rating = 250,
221 .read = read_hpet,
222 .mask = HPET_MASK,
223 .shift = HPET_SHIFT,
224 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
225 .resume = hpet_start_counter,
226};
227
228/*
229 * Try to setup the HPET timer
230 */
231int __init hpet_enable(void)
232{
233 unsigned long id;
234 uint64_t hpet_freq;
235 u64 tmp, start, now;
236 cycle_t t1;
237
238 if (!is_hpet_capable())
239 return 0;
240
241 hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
242
243 /*
244 * Read the period and check for a sane value:
245 */
246 hpet_period = hpet_readl(HPET_PERIOD);
247 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
248 goto out_nohpet;
249
250 /*
 251	 * The period is a femtoseconds value. We need to calculate the
252 * scaled math multiplication factor for nanosecond to hpet tick
253 * conversion.
254 */
255 hpet_freq = 1000000000000000ULL;
256 do_div(hpet_freq, hpet_period);
257 hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
258 NSEC_PER_SEC, 32);
259 /* Calculate the min / max delta */
260 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
261 &hpet_clockevent);
262 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
263 &hpet_clockevent);
264
265 /*
266 * Read the HPET ID register to retrieve the IRQ routing
267 * information and the number of channels
268 */
269 id = hpet_readl(HPET_ID);
270
271#ifdef CONFIG_HPET_EMULATE_RTC
272 /*
273 * The legacy routing mode needs at least two channels, tick timer
274 * and the rtc emulation channel.
275 */
276 if (!(id & HPET_ID_NUMBER))
277 goto out_nohpet;
278#endif
279
280 /* Start the counter */
281 hpet_start_counter();
282
283 /* Verify whether hpet counter works */
284 t1 = read_hpet();
285 rdtscll(start);
286
287 /*
288 * We don't know the TSC frequency yet, but waiting for
289 * 200000 TSC cycles is safe:
290 * 4 GHz == 50us
291 * 1 GHz == 200us
292 */
293 do {
294 rep_nop();
295 rdtscll(now);
296 } while ((now - start) < 200000UL);
297
298 if (t1 == read_hpet()) {
299 printk(KERN_WARNING
300 "HPET counter not counting. HPET disabled\n");
301 goto out_nohpet;
302 }
303
304 /* Initialize and register HPET clocksource
305 *
 306	 * hpet period is in femtoseconds per cycle,
 307	 * so we need to convert this to ns/cyc units
 308	 * approximated by mult/2^shift
309 *
310 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
311 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
312 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
313 * (fsec/cyc << shift)/1000000 = mult
314 * (hpet_period << shift)/FSEC_PER_NSEC = mult
315 */
316 tmp = (u64)hpet_period << HPET_SHIFT;
317 do_div(tmp, FSEC_PER_NSEC);
318 clocksource_hpet.mult = (u32)tmp;
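	/*
	 * Illustrative example only: a common 14.31818 MHz HPET has a
	 * period of roughly 69841279 fs, so mult ends up around
	 * (69841279 << 22) / 1000000 ~= 2.93e8, i.e. ~69.84 ns per
	 * HPET cycle once shifted back down by 2^22.
	 */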
319
320 clocksource_register(&clocksource_hpet);
321
322 if (id & HPET_ID_LEGSUP) {
323 hpet_enable_int();
324 hpet_reserve_platform_timers(id);
325 /*
326 * Start hpet with the boot cpu mask and make it
327 * global after the IO_APIC has been initialized.
328 */
329 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
330 clockevents_register_device(&hpet_clockevent);
331 global_clock_event = &hpet_clockevent;
332 return 1;
333 }
334 return 0;
335
336out_nohpet:
337 iounmap(hpet_virt_address);
338 hpet_virt_address = NULL;
339 boot_hpet_disable = 1;
340 return 0;
341}
342
343
344#ifdef CONFIG_HPET_EMULATE_RTC
345
346/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
347 * is enabled, we support RTC interrupt functionality in software.
348 * RTC has 3 kinds of interrupts:
349 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
350 * is updated
351 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
352 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
353 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
354 * (1) and (2) above are implemented using polling at a frequency of
355 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
356 * overhead. (DEFAULT_RTC_INT_FREQ)
357 * For (3), we use interrupts at 64Hz or user specified periodic
358 * frequency, whichever is higher.
359 */
360#include <linux/mc146818rtc.h>
361#include <linux/rtc.h>
362
363#define DEFAULT_RTC_INT_FREQ 64
364#define DEFAULT_RTC_SHIFT 6
365#define RTC_NUM_INTS 1
366
367static unsigned long hpet_rtc_flags;
368static unsigned long hpet_prev_update_sec;
369static struct rtc_time hpet_alarm_time;
370static unsigned long hpet_pie_count;
371static unsigned long hpet_t1_cmp;
372static unsigned long hpet_default_delta;
373static unsigned long hpet_pie_delta;
374static unsigned long hpet_pie_limit;
375
376/*
377 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
378 * is not supported by all HPET implementations for timer 1.
379 *
380 * hpet_rtc_timer_init() is called when the rtc is initialized.
381 */
382int hpet_rtc_timer_init(void)
383{
384 unsigned long cfg, cnt, delta, flags;
385
386 if (!is_hpet_enabled())
387 return 0;
388
389 if (!hpet_default_delta) {
390 uint64_t clc;
391
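		/*
		 * mult/2^shift converts nanoseconds to HPET ticks, so
		 * (mult * NSEC_PER_SEC) >> shift is ticks per second and
		 * the extra DEFAULT_RTC_SHIFT (6) divides that by 64.
		 */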
392 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
393 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
394 hpet_default_delta = (unsigned long) clc;
395 }
396
397 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
398 delta = hpet_default_delta;
399 else
400 delta = hpet_pie_delta;
401
402 local_irq_save(flags);
403
404 cnt = delta + hpet_readl(HPET_COUNTER);
405 hpet_writel(cnt, HPET_T1_CMP);
406 hpet_t1_cmp = cnt;
407
408 cfg = hpet_readl(HPET_T1_CFG);
409 cfg &= ~HPET_TN_PERIODIC;
410 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
411 hpet_writel(cfg, HPET_T1_CFG);
412
413 local_irq_restore(flags);
414
415 return 1;
416}
417
418/*
419 * The functions below are called from rtc driver.
420 * Return 0 if HPET is not being used.
421 * Otherwise do the necessary changes and return 1.
422 */
423int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 hpet_rtc_flags &= ~bit_mask;
429 return 1;
430}
431
432int hpet_set_rtc_irq_bit(unsigned long bit_mask)
433{
434 unsigned long oldbits = hpet_rtc_flags;
435
436 if (!is_hpet_enabled())
437 return 0;
438
439 hpet_rtc_flags |= bit_mask;
440
441 if (!oldbits)
442 hpet_rtc_timer_init();
443
444 return 1;
445}
446
447int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
448 unsigned char sec)
449{
450 if (!is_hpet_enabled())
451 return 0;
452
453 hpet_alarm_time.tm_hour = hrs;
454 hpet_alarm_time.tm_min = min;
455 hpet_alarm_time.tm_sec = sec;
456
457 return 1;
458}
459
460int hpet_set_periodic_freq(unsigned long freq)
461{
462 uint64_t clc;
463
464 if (!is_hpet_enabled())
465 return 0;
466
467 if (freq <= DEFAULT_RTC_INT_FREQ)
468 hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
469 else {
470 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
471 do_div(clc, freq);
472 clc >>= hpet_clockevent.shift;
473 hpet_pie_delta = (unsigned long) clc;
474 }
475 return 1;
476}
477
478int hpet_rtc_dropped_irq(void)
479{
480 return is_hpet_enabled();
481}
482
483static void hpet_rtc_timer_reinit(void)
484{
485 unsigned long cfg, delta;
486 int lost_ints = -1;
487
488 if (unlikely(!hpet_rtc_flags)) {
489 cfg = hpet_readl(HPET_T1_CFG);
490 cfg &= ~HPET_TN_ENABLE;
491 hpet_writel(cfg, HPET_T1_CFG);
492 return;
493 }
494
495 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
496 delta = hpet_default_delta;
497 else
498 delta = hpet_pie_delta;
499
500 /*
501 * Increment the comparator value until we are ahead of the
502 * current count.
503 */
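	/*
	 * lost_ints starts at -1 so that the mandatory first pass through
	 * this loop is not counted as a lost interrupt.
	 */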
504 do {
505 hpet_t1_cmp += delta;
506 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
507 lost_ints++;
508 } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
509
510 if (lost_ints) {
511 if (hpet_rtc_flags & RTC_PIE)
512 hpet_pie_count += lost_ints;
513 if (printk_ratelimit())
514 printk(KERN_WARNING "rtc: lost %d interrupts\n",
515 lost_ints);
516 }
517}
518
519irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
520{
521 struct rtc_time curr_time;
522 unsigned long rtc_int_flag = 0;
523
524 hpet_rtc_timer_reinit();
525
526 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
527 rtc_get_rtc_time(&curr_time);
528
529 if (hpet_rtc_flags & RTC_UIE &&
530 curr_time.tm_sec != hpet_prev_update_sec) {
531 rtc_int_flag = RTC_UF;
532 hpet_prev_update_sec = curr_time.tm_sec;
533 }
534
535 if (hpet_rtc_flags & RTC_PIE &&
536 ++hpet_pie_count >= hpet_pie_limit) {
537 rtc_int_flag |= RTC_PF;
538 hpet_pie_count = 0;
539 }
540
541 if (hpet_rtc_flags & RTC_PIE &&
542 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
543 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
544 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
545 rtc_int_flag |= RTC_AF;
546
547 if (rtc_int_flag) {
548 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
549 rtc_interrupt(rtc_int_flag, dev_id);
550 }
551 return IRQ_HANDLED;
552}
553#endif
diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c
new file mode 100644
index 000000000000..e2d1b912e154
--- /dev/null
+++ b/arch/x86/kernel/hpet_64.c
@@ -0,0 +1,493 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/mc146818rtc.h>
5#include <linux/time.h>
6#include <linux/clocksource.h>
7#include <linux/ioport.h>
8#include <linux/acpi.h>
9#include <linux/hpet.h>
10#include <asm/pgtable.h>
11#include <asm/vsyscall.h>
12#include <asm/timex.h>
13#include <asm/hpet.h>
14
15#define HPET_MASK 0xFFFFFFFF
16#define HPET_SHIFT 22
17
18/* FSEC = 10^-15 NSEC = 10^-9 */
19#define FSEC_PER_NSEC 1000000
20
21int nohpet __initdata;
22
23unsigned long hpet_address;
24unsigned long hpet_period; /* fsecs / HPET clock */
25unsigned long hpet_tick; /* HPET clocks / interrupt */
26
27int hpet_use_timer; /* Use counter of hpet for time keeping,
28 * otherwise PIT
29 */
30
31#ifdef CONFIG_HPET
32static __init int late_hpet_init(void)
33{
34 struct hpet_data hd;
35 unsigned int ntimer;
36
37 if (!hpet_address)
38 return 0;
39
40 memset(&hd, 0, sizeof(hd));
41
42 ntimer = hpet_readl(HPET_ID);
43 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
44 ntimer++;
45
46 /*
47 * Register with driver.
 48	 * Timer0 and Timer1 are used by the platform.
49 */
50 hd.hd_phys_address = hpet_address;
51 hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
52 hd.hd_nirqs = ntimer;
53 hd.hd_flags = HPET_DATA_PLATFORM;
54 hpet_reserve_timer(&hd, 0);
55#ifdef CONFIG_HPET_EMULATE_RTC
56 hpet_reserve_timer(&hd, 1);
57#endif
58 hd.hd_irq[0] = HPET_LEGACY_8254;
59 hd.hd_irq[1] = HPET_LEGACY_RTC;
60 if (ntimer > 2) {
61 struct hpet *hpet;
62 struct hpet_timer *timer;
63 int i;
64
65 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
66 timer = &hpet->hpet_timers[2];
67 for (i = 2; i < ntimer; timer++, i++)
68 hd.hd_irq[i] = (timer->hpet_config &
69 Tn_INT_ROUTE_CNF_MASK) >>
70 Tn_INT_ROUTE_CNF_SHIFT;
71
72 }
73
74 hpet_alloc(&hd);
75 return 0;
76}
77fs_initcall(late_hpet_init);
78#endif
79
80int hpet_timer_stop_set_go(unsigned long tick)
81{
82 unsigned int cfg;
83
84/*
85 * Stop the timers and reset the main counter.
86 */
87
88 cfg = hpet_readl(HPET_CFG);
89 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
90 hpet_writel(cfg, HPET_CFG);
91 hpet_writel(0, HPET_COUNTER);
92 hpet_writel(0, HPET_COUNTER + 4);
93
94/*
95 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
96 * and period also hpet_tick.
97 */
98 if (hpet_use_timer) {
99 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
100 HPET_TN_32BIT, HPET_T0_CFG);
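		/*
		 * With HPET_TN_SETVAL set above, the first T0_CMP write
		 * programs the comparator and the second one the period.
		 */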
101 hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
102 hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
103 cfg |= HPET_CFG_LEGACY;
104 }
105/*
106 * Go!
107 */
108
109 cfg |= HPET_CFG_ENABLE;
110 hpet_writel(cfg, HPET_CFG);
111
112 return 0;
113}
114
115static cycle_t read_hpet(void)
116{
117 return (cycle_t)hpet_readl(HPET_COUNTER);
118}
119
120static cycle_t __vsyscall_fn vread_hpet(void)
121{
122 return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
123}
124
125struct clocksource clocksource_hpet = {
126 .name = "hpet",
127 .rating = 250,
128 .read = read_hpet,
129 .mask = (cycle_t)HPET_MASK,
130 .mult = 0, /* set below */
131 .shift = HPET_SHIFT,
132 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
133 .vread = vread_hpet,
134};
135
136int __init hpet_arch_init(void)
137{
138 unsigned int id;
139 u64 tmp;
140
141 if (!hpet_address)
142 return -1;
143 set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
144 __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
145
146/*
147 * Read the period, compute tick and quotient.
148 */
149
150 id = hpet_readl(HPET_ID);
151
152 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
153 return -1;
154
155 hpet_period = hpet_readl(HPET_PERIOD);
156 if (hpet_period < 100000 || hpet_period > 100000000)
157 return -1;
158
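	/*
	 * Rounded division: femtoseconds per kernel tick over
	 * femtoseconds per HPET cycle gives HPET cycles per interrupt.
	 */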
159 hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
160
161 hpet_use_timer = (id & HPET_ID_LEGSUP);
162
163 /*
 164	 * hpet period is in femtoseconds per cycle,
 165	 * so we need to convert this to ns/cyc units
 166	 * approximated by mult/2^shift
167 *
168 * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
169 * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
170 * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
171 * (fsec/cyc << shift)/1000000 = mult
172 * (hpet_period << shift)/FSEC_PER_NSEC = mult
173 */
174 tmp = (u64)hpet_period << HPET_SHIFT;
175 do_div(tmp, FSEC_PER_NSEC);
176 clocksource_hpet.mult = (u32)tmp;
177 clocksource_register(&clocksource_hpet);
178
179 return hpet_timer_stop_set_go(hpet_tick);
180}
181
182int hpet_reenable(void)
183{
184 return hpet_timer_stop_set_go(hpet_tick);
185}
186
187/*
188 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
189 * it to the HPET timer of known frequency.
190 */
191
192#define TICK_COUNT 100000000
193#define SMI_THRESHOLD 50000
194#define MAX_TRIES 5
195
196/*
197 * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
198 * occurs between the reads of the hpet & TSC.
199 */
200static void __init read_hpet_tsc(int *hpet, int *tsc)
201{
202 int tsc1, tsc2, hpet1, i;
203
204 for (i = 0; i < MAX_TRIES; i++) {
205 tsc1 = get_cycles_sync();
206 hpet1 = hpet_readl(HPET_COUNTER);
207 tsc2 = get_cycles_sync();
208 if ((tsc2 - tsc1) < SMI_THRESHOLD)
209 break;
210 }
211 *hpet = hpet1;
212 *tsc = tsc2;
213}
214
215unsigned int __init hpet_calibrate_tsc(void)
216{
217 int tsc_start, hpet_start;
218 int tsc_now, hpet_now;
219 unsigned long flags;
220
221 local_irq_save(flags);
222
223 read_hpet_tsc(&hpet_start, &tsc_start);
224
225 do {
226 local_irq_disable();
227 read_hpet_tsc(&hpet_now, &tsc_now);
228 local_irq_restore(flags);
229 } while ((tsc_now - tsc_start) < TICK_COUNT &&
230 (hpet_now - hpet_start) < TICK_COUNT);
231
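	/*
	 * TSC ticks * 10^9 divided by the elapsed time in picoseconds
	 * (HPET ticks * hpet_period fs / 1000) gives the TSC frequency
	 * in kHz.
	 */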
232 return (tsc_now - tsc_start) * 1000000000L
233 / ((hpet_now - hpet_start) * hpet_period / 1000);
234}
235
236#ifdef CONFIG_HPET_EMULATE_RTC
237/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
238 * is enabled, we support RTC interrupt functionality in software.
239 * RTC has 3 kinds of interrupts:
240 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
241 * is updated
242 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
243 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
244 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
245 * (1) and (2) above are implemented using polling at a frequency of
246 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
247 * overhead. (DEFAULT_RTC_INT_FREQ)
248 * For (3), we use interrupts at 64Hz or user specified periodic
249 * frequency, whichever is higher.
250 */
251#include <linux/rtc.h>
252
253#define DEFAULT_RTC_INT_FREQ 64
254#define RTC_NUM_INTS 1
255
256static unsigned long UIE_on;
257static unsigned long prev_update_sec;
258
259static unsigned long AIE_on;
260static struct rtc_time alarm_time;
261
262static unsigned long PIE_on;
263static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
264static unsigned long PIE_count;
265
266static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
267static unsigned int hpet_t1_cmp; /* cached comparator register */
268
269int is_hpet_enabled(void)
270{
271 return hpet_address != 0;
272}
273
274/*
 275 * Timer 1 is used for RTC emulation; we do not use the periodic interrupt
 276 * feature, even if HPET supports periodic interrupts on Timer 1.
 277 * The reason is that setting up a periodic interrupt in HPET requires stopping
 278 * the main counter, and doing that every time someone disables or enables the
 279 * RTC would adversely affect the main kernel timer running on Timer 0.
 280 * So, for the time being, simulate the periodic interrupt in software.
 281 *
 282 * hpet_rtc_timer_init() is called for the first time, and during subsequent
 283 * interrupts reinit happens through hpet_rtc_timer_reinit().
284 */
285int hpet_rtc_timer_init(void)
286{
287 unsigned int cfg, cnt;
288 unsigned long flags;
289
290 if (!is_hpet_enabled())
291 return 0;
292 /*
 293	 * Set up counter 1 and enable the interrupts.
294 */
295 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
296 hpet_rtc_int_freq = PIE_freq;
297 else
298 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
299
300 local_irq_save(flags);
301
302 cnt = hpet_readl(HPET_COUNTER);
303 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
304 hpet_writel(cnt, HPET_T1_CMP);
305 hpet_t1_cmp = cnt;
306
307 cfg = hpet_readl(HPET_T1_CFG);
308 cfg &= ~HPET_TN_PERIODIC;
309 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
310 hpet_writel(cfg, HPET_T1_CFG);
311
312 local_irq_restore(flags);
313
314 return 1;
315}
316
317static void hpet_rtc_timer_reinit(void)
318{
319 unsigned int cfg, cnt, ticks_per_int, lost_ints;
320
321 if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
322 cfg = hpet_readl(HPET_T1_CFG);
323 cfg &= ~HPET_TN_ENABLE;
324 hpet_writel(cfg, HPET_T1_CFG);
325 return;
326 }
327
328 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
329 hpet_rtc_int_freq = PIE_freq;
330 else
331 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
332
 333	/* It is more accurate to use the comparator value than the current count. */
334 ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
335 hpet_t1_cmp += ticks_per_int;
336 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
337
338 /*
339 * If the interrupt handler was delayed too long, the write above tries
340 * to schedule the next interrupt in the past and the hardware would
341 * not interrupt until the counter had wrapped around.
342 * So we have to check that the comparator wasn't set to a past time.
343 */
344 cnt = hpet_readl(HPET_COUNTER);
345 if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
346 lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
347 /* Make sure that, even with the time needed to execute
 348		 * this code, the next scheduled interrupt has been pushed
 349		 * into the future: */
350 lost_ints++;
351
352 hpet_t1_cmp += lost_ints * ticks_per_int;
353 hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
354
355 if (PIE_on)
356 PIE_count += lost_ints;
357
358 if (printk_ratelimit())
359 printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
360 hpet_rtc_int_freq);
361 }
362}
363
364/*
365 * The functions below are called from rtc driver.
366 * Return 0 if HPET is not being used.
367 * Otherwise do the necessary changes and return 1.
368 */
369int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
370{
371 if (!is_hpet_enabled())
372 return 0;
373
374 if (bit_mask & RTC_UIE)
375 UIE_on = 0;
376 if (bit_mask & RTC_PIE)
377 PIE_on = 0;
378 if (bit_mask & RTC_AIE)
379 AIE_on = 0;
380
381 return 1;
382}
383
384int hpet_set_rtc_irq_bit(unsigned long bit_mask)
385{
386 int timer_init_reqd = 0;
387
388 if (!is_hpet_enabled())
389 return 0;
390
391 if (!(PIE_on | AIE_on | UIE_on))
392 timer_init_reqd = 1;
393
394 if (bit_mask & RTC_UIE) {
395 UIE_on = 1;
396 }
397 if (bit_mask & RTC_PIE) {
398 PIE_on = 1;
399 PIE_count = 0;
400 }
401 if (bit_mask & RTC_AIE) {
402 AIE_on = 1;
403 }
404
405 if (timer_init_reqd)
406 hpet_rtc_timer_init();
407
408 return 1;
409}
410
411int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
412{
413 if (!is_hpet_enabled())
414 return 0;
415
416 alarm_time.tm_hour = hrs;
417 alarm_time.tm_min = min;
418 alarm_time.tm_sec = sec;
419
420 return 1;
421}
422
423int hpet_set_periodic_freq(unsigned long freq)
424{
425 if (!is_hpet_enabled())
426 return 0;
427
428 PIE_freq = freq;
429 PIE_count = 0;
430
431 return 1;
432}
433
434int hpet_rtc_dropped_irq(void)
435{
436 if (!is_hpet_enabled())
437 return 0;
438
439 return 1;
440}
441
442irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
443{
444 struct rtc_time curr_time;
445 unsigned long rtc_int_flag = 0;
446 int call_rtc_interrupt = 0;
447
448 hpet_rtc_timer_reinit();
449
450 if (UIE_on | AIE_on) {
451 rtc_get_rtc_time(&curr_time);
452 }
453 if (UIE_on) {
454 if (curr_time.tm_sec != prev_update_sec) {
455 /* Set update int info, call real rtc int routine */
456 call_rtc_interrupt = 1;
457 rtc_int_flag = RTC_UF;
458 prev_update_sec = curr_time.tm_sec;
459 }
460 }
461 if (PIE_on) {
462 PIE_count++;
463 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
464 /* Set periodic int info, call real rtc int routine */
465 call_rtc_interrupt = 1;
466 rtc_int_flag |= RTC_PF;
467 PIE_count = 0;
468 }
469 }
470 if (AIE_on) {
471 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
472 (curr_time.tm_min == alarm_time.tm_min) &&
473 (curr_time.tm_hour == alarm_time.tm_hour)) {
474 /* Set alarm int info, call real rtc int routine */
475 call_rtc_interrupt = 1;
476 rtc_int_flag |= RTC_AF;
477 }
478 }
479 if (call_rtc_interrupt) {
480 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
481 rtc_interrupt(rtc_int_flag, dev_id);
482 }
483 return IRQ_HANDLED;
484}
485#endif
486
487static int __init nohpet_setup(char *s)
488{
489 nohpet = 1;
490 return 1;
491}
492
493__setup("nohpet", nohpet_setup);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
new file mode 100644
index 000000000000..e3d4b73bfdb0
--- /dev/null
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -0,0 +1,30 @@
1#include <linux/module.h>
2#include <asm/checksum.h>
3#include <asm/desc.h>
4
5EXPORT_SYMBOL(__down_failed);
6EXPORT_SYMBOL(__down_failed_interruptible);
7EXPORT_SYMBOL(__down_failed_trylock);
8EXPORT_SYMBOL(__up_wakeup);
9/* Networking helper routines. */
10EXPORT_SYMBOL(csum_partial_copy_generic);
11
12EXPORT_SYMBOL(__get_user_1);
13EXPORT_SYMBOL(__get_user_2);
14EXPORT_SYMBOL(__get_user_4);
15
16EXPORT_SYMBOL(__put_user_1);
17EXPORT_SYMBOL(__put_user_2);
18EXPORT_SYMBOL(__put_user_4);
19EXPORT_SYMBOL(__put_user_8);
20
21EXPORT_SYMBOL(strstr);
22
23#ifdef CONFIG_SMP
24extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
25extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
26EXPORT_SYMBOL(__write_lock_failed);
27EXPORT_SYMBOL(__read_lock_failed);
28#endif
29
30EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
new file mode 100644
index 000000000000..665847281ed2
--- /dev/null
+++ b/arch/x86/kernel/i387_32.c
@@ -0,0 +1,546 @@
1/*
2 * linux/arch/i386/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * General FPU state handling cleanups
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11#include <linux/sched.h>
12#include <linux/module.h>
13#include <asm/processor.h>
14#include <asm/i387.h>
15#include <asm/math_emu.h>
16#include <asm/sigcontext.h>
17#include <asm/user.h>
18#include <asm/ptrace.h>
19#include <asm/uaccess.h>
20
21#ifdef CONFIG_MATH_EMULATION
22#define HAVE_HWFP (boot_cpu_data.hard_math)
23#else
24#define HAVE_HWFP 1
25#endif
26
27static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned long mask = 0;
32 clts();
33 if (cpu_has_fxsr) {
34 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
35 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
36 mask = current->thread.i387.fxsave.mxcsr_mask;
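		/*
		 * A zero mxcsr_mask means the CPU does not report a mask;
		 * fall back to the architecturally defined default 0xffbf.
		 */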
37 if (mask == 0) mask = 0x0000ffbf;
38 }
39 mxcsr_feature_mask &= mask;
40 stts();
41}
42
43/*
 44 * The _current_ task is using the FPU for the first time,
 45 * so initialize it and set the mxcsr to its default
 46 * value at reset if we support XMM instructions, and then
 47 * remember that the current task has used the FPU.
48 */
49void init_fpu(struct task_struct *tsk)
50{
51 if (cpu_has_fxsr) {
52 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
53 tsk->thread.i387.fxsave.cwd = 0x37f;
54 if (cpu_has_xmm)
55 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
56 } else {
57 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
58 tsk->thread.i387.fsave.cwd = 0xffff037fu;
59 tsk->thread.i387.fsave.swd = 0xffff0000u;
60 tsk->thread.i387.fsave.twd = 0xffffffffu;
61 tsk->thread.i387.fsave.fos = 0xffff0000u;
62 }
63 /* only the device not available exception or ptrace can call init_fpu */
64 set_stopped_child_used_math(tsk);
65}
66
67/*
68 * FPU lazy state save handling.
69 */
70
71void kernel_fpu_begin(void)
72{
73 struct thread_info *thread = current_thread_info();
74
75 preempt_disable();
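	/*
	 * If the current task's FPU state is live in the registers, save
	 * it; otherwise just clear TS so kernel FPU use does not fault.
	 */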
76 if (thread->status & TS_USEDFPU) {
77 __save_init_fpu(thread->task);
78 return;
79 }
80 clts();
81}
82EXPORT_SYMBOL_GPL(kernel_fpu_begin);
83
84/*
85 * FPU tag word conversions.
86 */
87
88static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
89{
90 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
91
92 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
93 tmp = ~twd;
94 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
95 /* and move the valid bits to the lower byte. */
96 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
97 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
98 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
99 return tmp;
100}
101
102static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
103{
104 struct _fpxreg *st = NULL;
105 unsigned long tos = (fxsave->swd >> 11) & 7;
106 unsigned long twd = (unsigned long) fxsave->twd;
107 unsigned long tag;
108 unsigned long ret = 0xffff0000u;
109 int i;
110
111#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
112
113 for ( i = 0 ; i < 8 ; i++ ) {
114 if ( twd & 0x1 ) {
115 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
116
117 switch ( st->exponent & 0x7fff ) {
118 case 0x7fff:
119 tag = 2; /* Special */
120 break;
121 case 0x0000:
122 if ( !st->significand[0] &&
123 !st->significand[1] &&
124 !st->significand[2] &&
125 !st->significand[3] ) {
126 tag = 1; /* Zero */
127 } else {
128 tag = 2; /* Special */
129 }
130 break;
131 default:
132 if ( st->significand[3] & 0x8000 ) {
133 tag = 0; /* Valid */
134 } else {
135 tag = 2; /* Special */
136 }
137 break;
138 }
139 } else {
140 tag = 3; /* Empty */
141 }
142 ret |= (tag << (2 * i));
143 twd = twd >> 1;
144 }
145 return ret;
146}
147
148/*
149 * FPU state interaction.
150 */
151
152unsigned short get_fpu_cwd( struct task_struct *tsk )
153{
154 if ( cpu_has_fxsr ) {
155 return tsk->thread.i387.fxsave.cwd;
156 } else {
157 return (unsigned short)tsk->thread.i387.fsave.cwd;
158 }
159}
160
161unsigned short get_fpu_swd( struct task_struct *tsk )
162{
163 if ( cpu_has_fxsr ) {
164 return tsk->thread.i387.fxsave.swd;
165 } else {
166 return (unsigned short)tsk->thread.i387.fsave.swd;
167 }
168}
169
170#if 0
171unsigned short get_fpu_twd( struct task_struct *tsk )
172{
173 if ( cpu_has_fxsr ) {
174 return tsk->thread.i387.fxsave.twd;
175 } else {
176 return (unsigned short)tsk->thread.i387.fsave.twd;
177 }
178}
179#endif /* 0 */
180
181unsigned short get_fpu_mxcsr( struct task_struct *tsk )
182{
183 if ( cpu_has_xmm ) {
184 return tsk->thread.i387.fxsave.mxcsr;
185 } else {
186 return 0x1f80;
187 }
188}
189
190#if 0
191
192void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
193{
194 if ( cpu_has_fxsr ) {
195 tsk->thread.i387.fxsave.cwd = cwd;
196 } else {
197 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
198 }
199}
200
201void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
202{
203 if ( cpu_has_fxsr ) {
204 tsk->thread.i387.fxsave.swd = swd;
205 } else {
206 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
207 }
208}
209
210void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
211{
212 if ( cpu_has_fxsr ) {
213 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
214 } else {
215 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
216 }
217}
218
219#endif /* 0 */
220
221/*
222 * FXSR floating point environment conversions.
223 */
224
225static int convert_fxsr_to_user( struct _fpstate __user *buf,
226 struct i387_fxsave_struct *fxsave )
227{
228 unsigned long env[7];
229 struct _fpreg __user *to;
230 struct _fpxreg *from;
231 int i;
232
233 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
234 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
235 env[2] = twd_fxsr_to_i387(fxsave);
236 env[3] = fxsave->fip;
237 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
238 env[5] = fxsave->foo;
239 env[6] = fxsave->fos;
240
241 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
242 return 1;
243
244 to = &buf->_st[0];
245 from = (struct _fpxreg *) &fxsave->st_space[0];
246 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
247 unsigned long __user *t = (unsigned long __user *)to;
248 unsigned long *f = (unsigned long *)from;
249
250 if (__put_user(*f, t) ||
251 __put_user(*(f + 1), t + 1) ||
252 __put_user(from->exponent, &to->exponent))
253 return 1;
254 }
255 return 0;
256}
257
258static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
259 struct _fpstate __user *buf )
260{
261 unsigned long env[7];
262 struct _fpxreg *to;
263 struct _fpreg __user *from;
264 int i;
265
266 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
267 return 1;
268
269 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
270 fxsave->swd = (unsigned short)(env[1] & 0xffff);
271 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
272 fxsave->fip = env[3];
273 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
274 fxsave->fcs = (env[4] & 0xffff);
275 fxsave->foo = env[5];
276 fxsave->fos = env[6];
277
278 to = (struct _fpxreg *) &fxsave->st_space[0];
279 from = &buf->_st[0];
280 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
281 unsigned long *t = (unsigned long *)to;
282 unsigned long __user *f = (unsigned long __user *)from;
283
284 if (__get_user(*t, f) ||
285 __get_user(*(t + 1), f + 1) ||
286 __get_user(to->exponent, &from->exponent))
287 return 1;
288 }
289 return 0;
290}
291
292/*
293 * Signal frame handlers.
294 */
295
296static inline int save_i387_fsave( struct _fpstate __user *buf )
297{
298 struct task_struct *tsk = current;
299
300 unlazy_fpu( tsk );
301 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
302 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
303 sizeof(struct i387_fsave_struct) ) )
304 return -1;
305 return 1;
306}
307
308static int save_i387_fxsave( struct _fpstate __user *buf )
309{
310 struct task_struct *tsk = current;
311 int err = 0;
312
313 unlazy_fpu( tsk );
314
315 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
316 return -1;
317
318 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
319 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
320 if ( err )
321 return -1;
322
323 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
324 sizeof(struct i387_fxsave_struct) ) )
325 return -1;
326 return 1;
327}
328
329int save_i387( struct _fpstate __user *buf )
330{
331 if ( !used_math() )
332 return 0;
333
334 /* This will cause a "finit" to be triggered by the next
335 * attempted FPU operation by the 'current' process.
336 */
337 clear_used_math();
338
339 if ( HAVE_HWFP ) {
340 if ( cpu_has_fxsr ) {
341 return save_i387_fxsave( buf );
342 } else {
343 return save_i387_fsave( buf );
344 }
345 } else {
346 return save_i387_soft( &current->thread.i387.soft, buf );
347 }
348}
349
350static inline int restore_i387_fsave( struct _fpstate __user *buf )
351{
352 struct task_struct *tsk = current;
353 clear_fpu( tsk );
354 return __copy_from_user( &tsk->thread.i387.fsave, buf,
355 sizeof(struct i387_fsave_struct) );
356}
357
358static int restore_i387_fxsave( struct _fpstate __user *buf )
359{
360 int err;
361 struct task_struct *tsk = current;
362 clear_fpu( tsk );
363 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
364 sizeof(struct i387_fxsave_struct) );
365 /* mxcsr reserved bits must be masked to zero for security reasons */
366 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
367 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
368}
369
370int restore_i387( struct _fpstate __user *buf )
371{
372 int err;
373
374 if ( HAVE_HWFP ) {
375 if ( cpu_has_fxsr ) {
376 err = restore_i387_fxsave( buf );
377 } else {
378 err = restore_i387_fsave( buf );
379 }
380 } else {
381 err = restore_i387_soft( &current->thread.i387.soft, buf );
382 }
383 set_used_math();
384 return err;
385}
386
387/*
388 * ptrace request handlers.
389 */
390
391static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
392 struct task_struct *tsk )
393{
394 return __copy_to_user( buf, &tsk->thread.i387.fsave,
395 sizeof(struct user_i387_struct) );
396}
397
398static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
399 struct task_struct *tsk )
400{
401 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
402 &tsk->thread.i387.fxsave );
403}
404
405int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
406{
407 if ( HAVE_HWFP ) {
408 if ( cpu_has_fxsr ) {
409 return get_fpregs_fxsave( buf, tsk );
410 } else {
411 return get_fpregs_fsave( buf, tsk );
412 }
413 } else {
414 return save_i387_soft( &tsk->thread.i387.soft,
415 (struct _fpstate __user *)buf );
416 }
417}
418
419static inline int set_fpregs_fsave( struct task_struct *tsk,
420 struct user_i387_struct __user *buf )
421{
422 return __copy_from_user( &tsk->thread.i387.fsave, buf,
423 sizeof(struct user_i387_struct) );
424}
425
426static inline int set_fpregs_fxsave( struct task_struct *tsk,
427 struct user_i387_struct __user *buf )
428{
429 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
430 (struct _fpstate __user *)buf );
431}
432
433int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
434{
435 if ( HAVE_HWFP ) {
436 if ( cpu_has_fxsr ) {
437 return set_fpregs_fxsave( tsk, buf );
438 } else {
439 return set_fpregs_fsave( tsk, buf );
440 }
441 } else {
442 return restore_i387_soft( &tsk->thread.i387.soft,
443 (struct _fpstate __user *)buf );
444 }
445}
446
447int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
448{
449 if ( cpu_has_fxsr ) {
450 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
451 sizeof(struct user_fxsr_struct) ))
452 return -EFAULT;
453 return 0;
454 } else {
455 return -EIO;
456 }
457}
458
459int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
460{
461 int ret = 0;
462
463 if ( cpu_has_fxsr ) {
464 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
465 sizeof(struct user_fxsr_struct) ))
466 ret = -EFAULT;
467 /* mxcsr reserved bits must be masked to zero for security reasons */
468 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
469 } else {
470 ret = -EIO;
471 }
472 return ret;
473}
474
475/*
476 * FPU state for core dumps.
477 */
478
479static inline void copy_fpu_fsave( struct task_struct *tsk,
480 struct user_i387_struct *fpu )
481{
482 memcpy( fpu, &tsk->thread.i387.fsave,
483 sizeof(struct user_i387_struct) );
484}
485
486static inline void copy_fpu_fxsave( struct task_struct *tsk,
487 struct user_i387_struct *fpu )
488{
489 unsigned short *to;
490 unsigned short *from;
491 int i;
492
493 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
494
495 to = (unsigned short *)&fpu->st_space[0];
496 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
497 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
498 memcpy( to, from, 5 * sizeof(unsigned short) );
499 }
500}
501
502int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
503{
504 int fpvalid;
505 struct task_struct *tsk = current;
506
507 fpvalid = !!used_math();
508 if ( fpvalid ) {
509 unlazy_fpu( tsk );
510 if ( cpu_has_fxsr ) {
511 copy_fpu_fxsave( tsk, fpu );
512 } else {
513 copy_fpu_fsave( tsk, fpu );
514 }
515 }
516
517 return fpvalid;
518}
519EXPORT_SYMBOL(dump_fpu);
520
521int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
522{
523 int fpvalid = !!tsk_used_math(tsk);
524
525 if (fpvalid) {
526 if (tsk == current)
527 unlazy_fpu(tsk);
528 if (cpu_has_fxsr)
529 copy_fpu_fxsave(tsk, fpu);
530 else
531 copy_fpu_fsave(tsk, fpu);
532 }
533 return fpvalid;
534}
535
536int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
537{
538 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
539
540 if (fpvalid) {
541 if (tsk == current)
542 unlazy_fpu(tsk);
543 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
544 }
545 return fpvalid;
546}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
new file mode 100644
index 000000000000..1d58c13bc6bc
--- /dev/null
+++ b/arch/x86/kernel/i387_64.c
@@ -0,0 +1,151 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
 13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation;
 14 * the 64bit user space sees an FXSAVE frame directly.
15 */
16
17#include <linux/sched.h>
18#include <linux/init.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/sigcontext.h>
22#include <asm/user.h>
23#include <asm/ptrace.h>
24#include <asm/uaccess.h>
25
26unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
27
28void mxcsr_feature_mask_init(void)
29{
30 unsigned int mask;
31 clts();
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 mxcsr_feature_mask &= mask;
37 stts();
38}
39
40/*
41 * Called at bootup to set up the initial FPU state that is later cloned
42 * into all processes.
43 */
44void __cpuinit fpu_init(void)
45{
46 unsigned long oldcr0 = read_cr0();
47 extern void __bad_fxsave_alignment(void);
48
49 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
50 __bad_fxsave_alignment();
51 set_in_cr4(X86_CR4_OSFXSR);
52 set_in_cr4(X86_CR4_OSXMMEXCPT);
53
54 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
55
56 mxcsr_feature_mask_init();
57 /* clean state in init */
58 current_thread_info()->status = 0;
59 clear_used_math();
60}
61
62void init_fpu(struct task_struct *child)
63{
64 if (tsk_used_math(child)) {
65 if (child == current)
66 unlazy_fpu(child);
67 return;
68 }
69 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
70 child->thread.i387.fxsave.cwd = 0x37f;
71 child->thread.i387.fxsave.mxcsr = 0x1f80;
72 /* only the device not available exception or ptrace can call init_fpu */
73 set_stopped_child_used_math(child);
74}
75
76/*
77 * Signal frame handlers.
78 */
79
80int save_i387(struct _fpstate __user *buf)
81{
82 struct task_struct *tsk = current;
83 int err = 0;
84
85 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
86 sizeof(tsk->thread.i387.fxsave));
87
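	/* fxsave requires a 16-byte aligned save area */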
88 if ((unsigned long)buf % 16)
89 printk("save_i387: bad fpstate %p\n",buf);
90
91 if (!used_math())
92 return 0;
93 clear_used_math(); /* trigger finit */
94 if (task_thread_info(tsk)->status & TS_USEDFPU) {
95 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
96 if (err) return err;
97 stts();
98 } else {
99 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
100 sizeof(struct i387_fxsave_struct)))
101 return -1;
102 }
103 return 1;
104}
105
106/*
107 * ptrace request handlers.
108 */
109
110int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
111{
112 init_fpu(tsk);
113 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
114 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
115}
116
117int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
118{
119 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
120 sizeof(struct user_i387_struct)))
121 return -EFAULT;
122 return 0;
123}
124
125/*
126 * FPU state for core dumps.
127 */
128
129int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
130{
131 struct task_struct *tsk = current;
132
133 if (!used_math())
134 return 0;
135
136 unlazy_fpu(tsk);
137 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
138 return 1;
139}
140
141int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
142{
143 int fpvalid = !!tsk_used_math(tsk);
144
145 if (fpvalid) {
146 if (tsk == current)
147 unlazy_fpu(tsk);
148 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
 149	}
150 return fpvalid;
151}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 000000000000..6f508e8d7c57
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,72 @@
1/*
2 * i8237.c: 8237A DMA controller suspend functions.
3 *
4 * Written by Pierre Ossman, 2005.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or (at
9 * your option) any later version.
10 */
11
12#include <linux/init.h>
13#include <linux/sysdev.h>
14
15#include <asm/dma.h>
16
17/*
18 * This module just handles suspend/resume issues with the
19 * 8237A DMA controller (used for ISA and LPC).
20 * Allocation is handled in kernel/dma.c and normal usage is
21 * in asm/dma.h.
22 */
23
24static int i8237A_resume(struct sys_device *dev)
25{
26 unsigned long flags;
27 int i;
28
29 flags = claim_dma_lock();
30
31 dma_outb(DMA1_RESET_REG, 0);
32 dma_outb(DMA2_RESET_REG, 0);
33
34 for (i = 0;i < 8;i++) {
35 set_dma_addr(i, 0x000000);
 36		/* DMA count is a bit weird: set_dma_count() programs length-1, so don't use 0 */
37 set_dma_count(i, 1);
38 }
39
40 /* Enable cascade DMA or channel 0-3 won't work */
41 enable_dma(4);
42
43 release_dma_lock(flags);
44
45 return 0;
46}
47
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 set_kset_name("i8237"),
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume,
57};
58
59static struct sys_device device_i8237A = {
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{
66 int error = sysdev_class_register(&i8237_sysdev_class);
67 if (!error)
68 error = sysdev_register(&device_i8237A);
69 return error;
70}
71
72device_initcall(i8237A_init_sysfs);
diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c
new file mode 100644
index 000000000000..6d839f2f1b1a
--- /dev/null
+++ b/arch/x86/kernel/i8253_32.c
@@ -0,0 +1,206 @@
1/*
2 * i8253.c 8253/PIT functions
3 *
4 */
5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/jiffies.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/timer.h>
17
18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock);
20
21/*
 22 * HPET replaces the PIT when enabled, so we need to know which of
 23 * the two timers is used.
24 */
25struct clock_event_device *global_clock_event;
26
27/*
28 * Initialize the PIT timer.
29 *
30 * This is also called after resume to bring the PIT into operation again.
31 */
32static void init_pit_timer(enum clock_event_mode mode,
33 struct clock_event_device *evt)
34{
35 unsigned long flags;
36
37 spin_lock_irqsave(&i8253_lock, flags);
38
39 switch(mode) {
40 case CLOCK_EVT_MODE_PERIODIC:
41 /* binary, mode 2, LSB/MSB, ch 0 */
42 outb_p(0x34, PIT_MODE);
43 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
44 outb(LATCH >> 8 , PIT_CH0); /* MSB */
45 break;
46
47 case CLOCK_EVT_MODE_SHUTDOWN:
48 case CLOCK_EVT_MODE_UNUSED:
49 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
50 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
51 outb_p(0x30, PIT_MODE);
52 outb_p(0, PIT_CH0);
53 outb_p(0, PIT_CH0);
54 }
55 break;
56
57 case CLOCK_EVT_MODE_ONESHOT:
58 /* One shot setup */
59 outb_p(0x38, PIT_MODE);
60 break;
61
62 case CLOCK_EVT_MODE_RESUME:
63 /* Nothing to do here */
64 break;
65 }
66 spin_unlock_irqrestore(&i8253_lock, flags);
67}
68
69/*
70 * Program the next event in oneshot mode
71 *
72 * Delta is given in PIT ticks
73 */
74static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
75{
76 unsigned long flags;
77
78 spin_lock_irqsave(&i8253_lock, flags);
79 outb_p(delta & 0xff , PIT_CH0); /* LSB */
80 outb(delta >> 8 , PIT_CH0); /* MSB */
81 spin_unlock_irqrestore(&i8253_lock, flags);
82
83 return 0;
84}
85
86/*
87 * On UP the PIT can serve all of the possible timer functions. On SMP systems
88 * it can be solely used for the global tick.
89 *
 90 * The profiling and update capabilities are switched off once the local apic is
91 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
92 * !using_apic_timer decisions in do_timer_interrupt_hook()
93 */
94struct clock_event_device pit_clockevent = {
95 .name = "pit",
96 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
97 .set_mode = init_pit_timer,
98 .set_next_event = pit_next_event,
99 .shift = 32,
100 .irq = 0,
101};
102
103/*
104 * Initialize the conversion factor and the min/max deltas of the clock event
105 * structure and register the clock event source with the framework.
106 */
107void __init setup_pit_timer(void)
108{
109 /*
110 * Start pit with the boot cpu mask and make it global after the
111 * IO_APIC has been initialized.
112 */
113 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
114 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
115 pit_clockevent.max_delta_ns =
116 clockevent_delta2ns(0x7FFF, &pit_clockevent);
117 pit_clockevent.min_delta_ns =
118 clockevent_delta2ns(0xF, &pit_clockevent);
119 clockevents_register_device(&pit_clockevent);
120 global_clock_event = &pit_clockevent;
121}
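
/*
 * A rough sketch of the fixed point conversion used above, assuming the
 * generic clockevents helpers: with shift = 32,
 *
 *   mult        = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32)
 *               ~ (CLOCK_TICK_RATE << 32) / NSEC_PER_SEC
 *   delta2ns(d) ~ (d << 32) / mult = d * NSEC_PER_SEC / CLOCK_TICK_RATE
 *
 * so 0x7FFF and 0xF are the largest and smallest PIT tick deltas we are
 * willing to program, and delta2ns() expresses them in nanoseconds for
 * the framework.
 */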
122
123/*
124 * Since the PIT overflows every tick, it's not very useful
125 * to just read by itself. So use jiffies to emulate a free
126 * running counter:
127 */
128static cycle_t pit_read(void)
129{
130 unsigned long flags;
131 int count;
132 u32 jifs;
133 static int old_count;
134 static u32 old_jifs;
135
136 spin_lock_irqsave(&i8253_lock, flags);
137 /*
138 * Although our caller may have the read side of xtime_lock,
139 * this is now a seqlock, and we are cheating in this routine
140 * by having side effects on state that we cannot undo if
141 * there is a collision on the seqlock and our caller has to
142 * retry. (Namely, old_jifs and old_count.) So we must treat
143 * jiffies as volatile despite the lock. We read jiffies
144 * before latching the timer count to guarantee that although
145 * the jiffies value might be older than the count (that is,
146 * the counter may underflow between the last point where
147 * jiffies was incremented and the point where we latch the
148 * count), it cannot be newer.
149 */
150 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8;
154
155 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1;
161 }
162
163 /*
164 * It's possible for count to appear to go the wrong way for a
165 * couple of reasons:
166 *
167 * 1. The timer counter underflows, but we haven't handled the
168 * resulting interrupt and incremented jiffies yet.
169 * 2. Hardware problem with the timer: it does not give us continuous time;
170 * the counter does small "jumps" upwards on some Pentium systems
171 * (see c't 95/10 page 335 for the Neptun bug).
172 *
173 * Previous attempts to handle these cases intelligently were
174 * buggy, so we just do the simple thing now.
175 */
176 if (count > old_count && jifs == old_jifs) {
177 count = old_count;
178 }
179 old_count = count;
180 old_jifs = jifs;
181
182 spin_unlock_irqrestore(&i8253_lock, flags);
183
184 count = (LATCH - 1) - count;
185
186 return (cycle_t)(jifs * LATCH) + count;
187}
188
189static struct clocksource clocksource_pit = {
190 .name = "pit",
191 .rating = 110,
192 .read = pit_read,
193 .mask = CLOCKSOURCE_MASK(32),
194 .mult = 0,
195 .shift = 20,
196};
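
/*
 * Sketch of how the clocksource fields are used, assuming the generic
 * clocksource conventions: ns elapsed = (cycles * mult) >> shift, so
 * with shift = 20 the init code below picks
 *
 *   mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20)
 *        ~ (NSEC_PER_SEC << 20) / CLOCK_TICK_RATE
 *
 * The low rating (110) keeps the PIT as a fallback only; better
 * clocksources (TSC, HPET) register with higher ratings and win when
 * they are usable.
 */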
197
198static int __init init_pit_clocksource(void)
199{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */
201 return 0;
202
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
204 return clocksource_register(&clocksource_pit);
205}
206arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
new file mode 100644
index 000000000000..0499cbe9871a
--- /dev/null
+++ b/arch/x86/kernel/i8259_32.c
@@ -0,0 +1,420 @@
1#include <linux/errno.h>
2#include <linux/signal.h>
3#include <linux/sched.h>
4#include <linux/ioport.h>
5#include <linux/interrupt.h>
6#include <linux/slab.h>
7#include <linux/random.h>
8#include <linux/init.h>
9#include <linux/kernel_stat.h>
10#include <linux/sysdev.h>
11#include <linux/bitops.h>
12
13#include <asm/8253pit.h>
14#include <asm/atomic.h>
15#include <asm/system.h>
16#include <asm/io.h>
17#include <asm/timer.h>
18#include <asm/pgtable.h>
19#include <asm/delay.h>
20#include <asm/desc.h>
21#include <asm/apic.h>
22#include <asm/arch_hooks.h>
23#include <asm/i8259.h>
24
25#include <io_ports.h>
26
27/*
28 * This is the 'legacy' 8259A Programmable Interrupt Controller,
29 * present in the majority of PC/AT boxes.
30 * plus some generic x86-specific things, to the extent that generic
31 * specifics make any sense at all.
32 * This file should become arch/i386/kernel/irq.c when the old irq.c
33 * moves to arch-independent land.
34 */
35
36static int i8259A_auto_eoi;
37DEFINE_SPINLOCK(i8259A_lock);
38static void mask_and_ack_8259A(unsigned int);
39
40static struct irq_chip i8259A_chip = {
41 .name = "XT-PIC",
42 .mask = disable_8259A_irq,
43 .disable = disable_8259A_irq,
44 .unmask = enable_8259A_irq,
45 .mask_ack = mask_and_ack_8259A,
46};
47
48/*
49 * 8259A PIC functions to handle ISA devices:
50 */
51
52/*
53 * This contains the irq mask for both 8259A irq controllers.
54 */
55unsigned int cached_irq_mask = 0xffff;
56
57/*
58 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
59 * boards the timer interrupt is not really connected to any IO-APIC pin,
60 * it's fed to the master 8259A's IR0 line only.
61 *
62 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
63 * this 'mixed mode' IRQ handling costs nothing because it's only used
64 * at IRQ setup time.
65 */
66unsigned long io_apic_irqs;
67
68void disable_8259A_irq(unsigned int irq)
69{
70 unsigned int mask = 1 << irq;
71 unsigned long flags;
72
73 spin_lock_irqsave(&i8259A_lock, flags);
74 cached_irq_mask |= mask;
75 if (irq & 8)
76 outb(cached_slave_mask, PIC_SLAVE_IMR);
77 else
78 outb(cached_master_mask, PIC_MASTER_IMR);
79 spin_unlock_irqrestore(&i8259A_lock, flags);
80}
81
82void enable_8259A_irq(unsigned int irq)
83{
84 unsigned int mask = ~(1 << irq);
85 unsigned long flags;
86
87 spin_lock_irqsave(&i8259A_lock, flags);
88 cached_irq_mask &= mask;
89 if (irq & 8)
90 outb(cached_slave_mask, PIC_SLAVE_IMR);
91 else
92 outb(cached_master_mask, PIC_MASTER_IMR);
93 spin_unlock_irqrestore(&i8259A_lock, flags);
94}
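
/*
 * Note on the cached_*_mask accessors used above: assuming the usual
 * definitions in asm/i8259.h, cached_master_mask and cached_slave_mask
 * are just the low and high byte of cached_irq_mask, i.e. roughly
 *
 *   #define cached_master_mask (((unsigned char *)&cached_irq_mask)[0])
 *   #define cached_slave_mask  (((unsigned char *)&cached_irq_mask)[1])
 *
 * so updating the 16 bit mask and writing a single byte to the right
 * IMR keeps the hardware and the cache in sync.
 */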
95
96int i8259A_irq_pending(unsigned int irq)
97{
98 unsigned int mask = 1<<irq;
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&i8259A_lock, flags);
103 if (irq < 8)
104 ret = inb(PIC_MASTER_CMD) & mask;
105 else
106 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
107 spin_unlock_irqrestore(&i8259A_lock, flags);
108
109 return ret;
110}
111
112void make_8259A_irq(unsigned int irq)
113{
114 disable_irq_nosync(irq);
115 io_apic_irqs &= ~(1<<irq);
116 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
117 "XT");
118 enable_irq(irq);
119}
120
121/*
122 * This function is assumed to be called rarely. Switching between
123 * 8259A registers is slow.
124 * This has to be protected by the irq controller spinlock
125 * before being called.
126 */
127static inline int i8259A_irq_real(unsigned int irq)
128{
129 int value;
130 int irqmask = 1<<irq;
131
132 if (irq < 8) {
133 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
134 value = inb(PIC_MASTER_CMD) & irqmask;
135 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
136 return value;
137 }
138 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
139 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
140 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
141 return value;
142}
143
144/*
145 * Careful! The 8259A is a fragile beast, it pretty
146 * much _has_ to be done exactly like this (mask it
147 * first, _then_ send the EOI, and the order of EOI
148 * to the two 8259s is important!
149 */
150static void mask_and_ack_8259A(unsigned int irq)
151{
152 unsigned int irqmask = 1 << irq;
153 unsigned long flags;
154
155 spin_lock_irqsave(&i8259A_lock, flags);
156 /*
157 * Lightweight spurious IRQ detection. We do not want
158 * to overdo spurious IRQ handling - it's usually a sign
159 * of hardware problems, so we only do the checks we can
160 * do without slowing down good hardware unnecessarily.
161 *
162 * Note that IRQ7 and IRQ15 (the two spurious IRQs
163 * usually resulting from the 8259A-1|2 PICs) occur
164 * even if the IRQ is masked in the 8259A. Thus we
165 * can check spurious 8259A IRQs without doing the
166 * quite slow i8259A_irq_real() call for every IRQ.
167 * This does not cover 100% of spurious interrupts,
168 * but should be enough to warn the user that there
169 * is something bad going on ...
170 */
171 if (cached_irq_mask & irqmask)
172 goto spurious_8259A_irq;
173 cached_irq_mask |= irqmask;
174
175handle_real_irq:
176 if (irq & 8) {
177 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
178 outb(cached_slave_mask, PIC_SLAVE_IMR);
179 outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
180 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
181 } else {
182 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
183 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI' to master */
185 }
186 spin_unlock_irqrestore(&i8259A_lock, flags);
187 return;
188
189spurious_8259A_irq:
190 /*
191 * this is the slow path - should happen rarely.
192 */
193 if (i8259A_irq_real(irq))
194 /*
195 * oops, the IRQ _is_ in service according to the
196 * 8259A - not spurious, go handle it.
197 */
198 goto handle_real_irq;
199
200 {
201 static int spurious_irq_mask;
202 /*
203 * At this point we can be sure the IRQ is spurious,
204 * let's ACK and report it. [once per IRQ]
205 */
206 if (!(spurious_irq_mask & irqmask)) {
207 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
208 spurious_irq_mask |= irqmask;
209 }
210 atomic_inc(&irq_err_count);
211 /*
212 * Theoretically we do not have to handle this IRQ,
213 * but in Linux this does not cause problems and is
214 * simpler for us.
215 */
216 goto handle_real_irq;
217 }
218}
219
220static char irq_trigger[2];
221/**
222 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
223 */
224static void restore_ELCR(char *trigger)
225{
226 outb(trigger[0], 0x4d0);
227 outb(trigger[1], 0x4d1);
228}
229
230static void save_ELCR(char *trigger)
231{
232 /* IRQ 0,1,2,8,13 are marked as reserved */
233 trigger[0] = inb(0x4d0) & 0xF8;
234 trigger[1] = inb(0x4d1) & 0xDE;
235}
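
/*
 * ELCR layout, for reference: port 0x4d0 covers IRQ0-7 and port 0x4d1
 * covers IRQ8-15; a set bit means the IRQ is level triggered, a clear
 * bit means edge triggered. The masks above (0xF8 and 0xDE) drop the
 * reserved, always-edge lines: IRQ0-2 in the first byte and IRQ8/IRQ13
 * (bits 0 and 5) in the second.
 */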
236
237static int i8259A_resume(struct sys_device *dev)
238{
239 init_8259A(i8259A_auto_eoi);
240 restore_ELCR(irq_trigger);
241 return 0;
242}
243
244static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
245{
246 save_ELCR(irq_trigger);
247 return 0;
248}
249
250static int i8259A_shutdown(struct sys_device *dev)
251{
252 /* Put the i8259A into a quiescent state that
253 * the kernel initialization code can get it
254 * out of.
255 */
256 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
257 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
258 return 0;
259}
260
261static struct sysdev_class i8259_sysdev_class = {
262 set_kset_name("i8259"),
263 .suspend = i8259A_suspend,
264 .resume = i8259A_resume,
265 .shutdown = i8259A_shutdown,
266};
267
268static struct sys_device device_i8259A = {
269 .id = 0,
270 .cls = &i8259_sysdev_class,
271};
272
273static int __init i8259A_init_sysfs(void)
274{
275 int error = sysdev_class_register(&i8259_sysdev_class);
276 if (!error)
277 error = sysdev_register(&device_i8259A);
278 return error;
279}
280
281device_initcall(i8259A_init_sysfs);
282
283void init_8259A(int auto_eoi)
284{
285 unsigned long flags;
286
287 i8259A_auto_eoi = auto_eoi;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 /*
295 * outb_p - this has to work on a wide range of PC hardware.
296 */
297 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
298 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
299 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
300 if (auto_eoi) /* master does Auto EOI */
301 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
302 else /* master expects normal EOI */
303 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
304
305 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
306 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
307 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
308 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
309 if (auto_eoi)
310 /*
311 * In AEOI mode we just have to mask the interrupt
312 * when acking.
313 */
314 i8259A_chip.mask_ack = disable_8259A_irq;
315 else
316 i8259A_chip.mask_ack = mask_and_ack_8259A;
317
318 udelay(100); /* wait for 8259A to initialize */
319
320 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
321 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
322
323 spin_unlock_irqrestore(&i8259A_lock, flags);
324}
325
326/*
327 * Note that on a 486, we don't want to do a SIGFPE on an irq13
328 * as the irq is unreliable, and exception 16 works correctly
329 * (ie as explained in the intel literature). On a 386, you
330 * can't use exception 16 due to bad IBM design, so we have to
331 * rely on the less exact irq13.
332 *
333 * Careful: not only is IRQ13 unreliable, it also
334 * leads to races. IBM designers who came up with it should
335 * be shot.
336 */
337
338
339static irqreturn_t math_error_irq(int cpl, void *dev_id)
340{
341 extern void math_error(void __user *);
342 outb(0,0xF0);
343 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
344 return IRQ_NONE;
345 math_error((void __user *)get_irq_regs()->eip);
346 return IRQ_HANDLED;
347}
348
349/*
350 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
351 * so allow interrupt sharing.
352 */
353static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
354
355void __init init_ISA_irqs (void)
356{
357 int i;
358
359#ifdef CONFIG_X86_LOCAL_APIC
360 init_bsp_APIC();
361#endif
362 init_8259A(0);
363
364 for (i = 0; i < NR_IRQS; i++) {
365 irq_desc[i].status = IRQ_DISABLED;
366 irq_desc[i].action = NULL;
367 irq_desc[i].depth = 1;
368
369 if (i < 16) {
370 /*
371 * 16 old-style INTA-cycle interrupts:
372 */
373 set_irq_chip_and_handler_name(i, &i8259A_chip,
374 handle_level_irq, "XT");
375 } else {
376 /*
377 * 'high' PCI IRQs filled in on demand
378 */
379 irq_desc[i].chip = &no_irq_chip;
380 }
381 }
382}
383
384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
388{
389 int i;
390
391 /* all the set up before the call gates are initialised */
392 pre_intr_init_hook();
393
394 /*
395 * Cover the whole vector space, no vector can escape
396 * us. (some of these will be overridden and become
397 * 'special' SMP interrupts)
398 */
399 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
400 int vector = FIRST_EXTERNAL_VECTOR + i;
401 if (i >= NR_IRQS)
402 break;
403 if (vector != SYSCALL_VECTOR)
404 set_intr_gate(vector, interrupt[i]);
405 }
406
407 /* setup after call gates are initialised (usually add in
408 * the architecture specific gates)
409 */
410 intr_init_hook();
411
412 /*
413 * External FPU? Set up irq13 if so, for
414 * original braindamaged IBM FERR coupling.
415 */
416 if (boot_cpu_data.hard_math && !cpu_has_fpu)
417 setup_irq(FPU_IRQ, &fpu_irq);
418
419 irq_ctx_init(smp_processor_id());
420}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
new file mode 100644
index 000000000000..948cae646099
--- /dev/null
+++ b/arch/x86/kernel/i8259_64.c
@@ -0,0 +1,544 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14
15#include <asm/acpi.h>
16#include <asm/atomic.h>
17#include <asm/system.h>
18#include <asm/io.h>
19#include <asm/hw_irq.h>
20#include <asm/pgtable.h>
21#include <asm/delay.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24
25/*
26 * Common place to define all x86 IRQ vectors
27 *
28 * This builds up the IRQ handler stubs using some ugly macros in irq.h
29 *
30 * These macros create the low-level assembly IRQ routines that save
31 * register context and call do_IRQ(). do_IRQ() then does all the
32 * operations that are needed to keep the AT (or SMP IOAPIC)
33 * interrupt-controller happy.
34 */
35
36#define BI(x,y) \
37 BUILD_IRQ(x##y)
38
39#define BUILD_16_IRQS(x) \
40 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
41 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
42 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
43 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
44
45/*
46 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
47 * (these are usually mapped to vectors 0x30-0x3f)
48 */
49
50/*
51 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all
55 * of these. Plus, more powerful systems might have more than 64
56 * IO-APIC registers.
57 *
58 * (these are usually mapped into the 0x30-0xff vector range)
59 */
60 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
61BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
62BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
63BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
64
65#undef BUILD_16_IRQS
66#undef BI
67
68
69#define IRQ(x,y) \
70 IRQ##x##y##_interrupt
71
72#define IRQLIST_16(x) \
73 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
74 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
75 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77
78/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
83 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
84};
85
86#undef IRQ
87#undef IRQLIST_16
88
89/*
90 * This is the 'legacy' 8259A Programmable Interrupt Controller,
91 * present in the majority of PC/AT boxes.
92 * plus some generic x86-specific things, to the extent that generic
93 * specifics make any sense at all.
94 * This file should become arch/i386/kernel/irq.c when the old irq.c
95 * moves to arch-independent land.
96 */
97
98static int i8259A_auto_eoi;
99DEFINE_SPINLOCK(i8259A_lock);
100static void mask_and_ack_8259A(unsigned int);
101
102static struct irq_chip i8259A_chip = {
103 .name = "XT-PIC",
104 .mask = disable_8259A_irq,
105 .disable = disable_8259A_irq,
106 .unmask = enable_8259A_irq,
107 .mask_ack = mask_and_ack_8259A,
108};
109
110/*
111 * 8259A PIC functions to handle ISA devices:
112 */
113
114/*
115 * This contains the irq mask for both 8259A irq controllers.
116 */
117static unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122
123/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
125 * boards the timer interrupt is not really connected to any IO-APIC pin,
126 * it's fed to the master 8259A's IR0 line only.
127 *
128 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
129 * this 'mixed mode' IRQ handling costs nothing because it's only used
130 * at IRQ setup time.
131 */
132unsigned long io_apic_irqs;
133
134void disable_8259A_irq(unsigned int irq)
135{
136 unsigned int mask = 1 << irq;
137 unsigned long flags;
138
139 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask;
141 if (irq & 8)
142 outb(cached_A1,0xA1);
143 else
144 outb(cached_21,0x21);
145 spin_unlock_irqrestore(&i8259A_lock, flags);
146}
147
148void enable_8259A_irq(unsigned int irq)
149{
150 unsigned int mask = ~(1 << irq);
151 unsigned long flags;
152
153 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask;
155 if (irq & 8)
156 outb(cached_A1,0xA1);
157 else
158 outb(cached_21,0x21);
159 spin_unlock_irqrestore(&i8259A_lock, flags);
160}
161
162int i8259A_irq_pending(unsigned int irq)
163{
164 unsigned int mask = 1<<irq;
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8)
170 ret = inb(0x20) & mask;
171 else
172 ret = inb(0xA0) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags);
174
175 return ret;
176}
177
178void make_8259A_irq(unsigned int irq)
179{
180 disable_irq_nosync(irq);
181 io_apic_irqs &= ~(1<<irq);
182 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
183 "XT");
184 enable_irq(irq);
185}
186
187/*
188 * This function is assumed to be called rarely. Switching between
189 * 8259A registers is slow.
190 * This has to be protected by the irq controller spinlock
191 * before being called.
192 */
193static inline int i8259A_irq_real(unsigned int irq)
194{
195 int value;
196 int irqmask = 1<<irq;
197
198 if (irq < 8) {
199 outb(0x0B,0x20); /* ISR register */
200 value = inb(0x20) & irqmask;
201 outb(0x0A,0x20); /* back to the IRR register */
202 return value;
203 }
204 outb(0x0B,0xA0); /* ISR register */
205 value = inb(0xA0) & (irqmask >> 8);
206 outb(0x0A,0xA0); /* back to the IRR register */
207 return value;
208}
209
210/*
211 * Careful! The 8259A is a fragile beast, it pretty
212 * much _has_ to be done exactly like this (mask it
213 * first, _then_ send the EOI, and the order of EOI
214 * to the two 8259s is important!
215 */
216static void mask_and_ack_8259A(unsigned int irq)
217{
218 unsigned int irqmask = 1 << irq;
219 unsigned long flags;
220
221 spin_lock_irqsave(&i8259A_lock, flags);
222 /*
223 * Lightweight spurious IRQ detection. We do not want
224 * to overdo spurious IRQ handling - it's usually a sign
225 * of hardware problems, so we only do the checks we can
226 * do without slowing down good hardware unnecessarily.
227 *
228 * Note that IRQ7 and IRQ15 (the two spurious IRQs
229 * usually resulting from the 8259A-1|2 PICs) occur
230 * even if the IRQ is masked in the 8259A. Thus we
231 * can check spurious 8259A IRQs without doing the
232 * quite slow i8259A_irq_real() call for every IRQ.
233 * This does not cover 100% of spurious interrupts,
234 * but should be enough to warn the user that there
235 * is something bad going on ...
236 */
237 if (cached_irq_mask & irqmask)
238 goto spurious_8259A_irq;
239 cached_irq_mask |= irqmask;
240
241handle_real_irq:
242 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
247 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return;
254
255spurious_8259A_irq:
256 /*
257 * this is the slow path - should happen rarely.
258 */
259 if (i8259A_irq_real(irq))
260 /*
261 * oops, the IRQ _is_ in service according to the
262 * 8259A - not spurious, go handle it.
263 */
264 goto handle_real_irq;
265
266 {
267 static int spurious_irq_mask;
268 /*
269 * At this point we can be sure the IRQ is spurious,
270 * let's ACK and report it. [once per IRQ]
271 */
272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask;
275 }
276 atomic_inc(&irq_err_count);
277 /*
278 * Theoretically we do not have to handle this IRQ,
279 * but in Linux this does not cause problems and is
280 * simpler for us.
281 */
282 goto handle_real_irq;
283 }
284}
285
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
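
/*
 * For reference, the raw ICW values used above (the 32 bit variant
 * spells most of these with symbolic names): ICW1 0x11 = edge
 * triggered, cascade mode, ICW4 needed; ICW3 is 0x04 on the master
 * (a bitmask: slave attached on IR2) but 0x02 on the slave (its
 * cascade identity); ICW4 0x01 = 8086 mode, 0x03 = 8086 mode plus
 * automatic EOI.
 */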
330
331static char irq_trigger[2];
332/**
333 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
334 */
335static void restore_ELCR(char *trigger)
336{
337 outb(trigger[0], 0x4d0);
338 outb(trigger[1], 0x4d1);
339}
340
341static void save_ELCR(char *trigger)
342{
343 /* IRQ 0,1,2,8,13 are marked as reserved */
344 trigger[0] = inb(0x4d0) & 0xF8;
345 trigger[1] = inb(0x4d1) & 0xDE;
346}
347
348static int i8259A_resume(struct sys_device *dev)
349{
350 init_8259A(i8259A_auto_eoi);
351 restore_ELCR(irq_trigger);
352 return 0;
353}
354
355static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
356{
357 save_ELCR(irq_trigger);
358 return 0;
359}
360
361static int i8259A_shutdown(struct sys_device *dev)
362{
363 /* Put the i8259A into a quiescent state that
364 * the kernel initialization code can get it
365 * out of.
366 */
367 outb(0xff, 0x21); /* mask all of 8259A-1 */
368 outb(0xff, 0xA1); /* mask all of 8259A-2 */
369 return 0;
370}
371
372static struct sysdev_class i8259_sysdev_class = {
373 set_kset_name("i8259"),
374 .suspend = i8259A_suspend,
375 .resume = i8259A_resume,
376 .shutdown = i8259A_shutdown,
377};
378
379static struct sys_device device_i8259A = {
380 .id = 0,
381 .cls = &i8259_sysdev_class,
382};
383
384static int __init i8259A_init_sysfs(void)
385{
386 int error = sysdev_class_register(&i8259_sysdev_class);
387 if (!error)
388 error = sysdev_register(&device_i8259A);
389 return error;
390}
391
392device_initcall(i8259A_init_sysfs);
393
394/*
395 * IRQ2 is cascade interrupt to second interrupt controller
396 */
397
398static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
399DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
400 [0 ... IRQ0_VECTOR - 1] = -1,
401 [IRQ0_VECTOR] = 0,
402 [IRQ1_VECTOR] = 1,
403 [IRQ2_VECTOR] = 2,
404 [IRQ3_VECTOR] = 3,
405 [IRQ4_VECTOR] = 4,
406 [IRQ5_VECTOR] = 5,
407 [IRQ6_VECTOR] = 6,
408 [IRQ7_VECTOR] = 7,
409 [IRQ8_VECTOR] = 8,
410 [IRQ9_VECTOR] = 9,
411 [IRQ10_VECTOR] = 10,
412 [IRQ11_VECTOR] = 11,
413 [IRQ12_VECTOR] = 12,
414 [IRQ13_VECTOR] = 13,
415 [IRQ14_VECTOR] = 14,
416 [IRQ15_VECTOR] = 15,
417 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
418};
419
420void __init init_ISA_irqs (void)
421{
422 int i;
423
424 init_bsp_APIC();
425 init_8259A(0);
426
427 for (i = 0; i < NR_IRQS; i++) {
428 irq_desc[i].status = IRQ_DISABLED;
429 irq_desc[i].action = NULL;
430 irq_desc[i].depth = 1;
431
432 if (i < 16) {
433 /*
434 * 16 old-style INTA-cycle interrupts:
435 */
436 set_irq_chip_and_handler_name(i, &i8259A_chip,
437 handle_level_irq, "XT");
438 } else {
439 /*
440 * 'high' PCI IRQs filled in on demand
441 */
442 irq_desc[i].chip = &no_irq_chip;
443 }
444 }
445}
446
447static void setup_timer_hardware(void)
448{
449 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
450 udelay(10);
451 outb_p(LATCH & 0xff , 0x40); /* LSB */
452 udelay(10);
453 outb(LATCH >> 8 , 0x40); /* MSB */
454}
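
/*
 * 0x43 and 0x40 are the PIT mode/command and channel 0 data ports
 * (PIT_MODE and PIT_CH0 in the 32 bit code); 0x34 reprograms channel 0
 * as a binary rate generator (mode 2) that reloads every LATCH input
 * cycles, i.e. fires roughly HZ times per second.
 */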
455
456static int timer_resume(struct sys_device *dev)
457{
458 setup_timer_hardware();
459 return 0;
460}
461
462void i8254_timer_resume(void)
463{
464 setup_timer_hardware();
465}
466
467static struct sysdev_class timer_sysclass = {
468 set_kset_name("timer_pit"),
469 .resume = timer_resume,
470};
471
472static struct sys_device device_timer = {
473 .id = 0,
474 .cls = &timer_sysclass,
475};
476
477static int __init init_timer_sysfs(void)
478{
479 int error = sysdev_class_register(&timer_sysclass);
480 if (!error)
481 error = sysdev_register(&device_timer);
482 return error;
483}
484
485device_initcall(init_timer_sysfs);
486
487void __init init_IRQ(void)
488{
489 int i;
490
491 init_ISA_irqs();
492 /*
493 * Cover the whole vector space, no vector can escape
494 * us. (some of these will be overridden and become
495 * 'special' SMP interrupts)
496 */
497 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
498 int vector = FIRST_EXTERNAL_VECTOR + i;
499 if (vector != IA32_SYSCALL_VECTOR)
500 set_intr_gate(vector, interrupt[i]);
501 }
502
503#ifdef CONFIG_SMP
504 /*
505 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
506 * IPI, driven by wakeup.
507 */
508 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
509
510 /* IPIs for invalidation */
511 set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
512 set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
513 set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
514 set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
515 set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
516 set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
517 set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
518 set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
519
520 /* IPI for generic function call */
521 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
522
523 /* Low priority IPI to cleanup after moving an irq */
524 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
525#endif
526 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
527 set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
528
529 /* self generated IPI for local APIC timer */
530 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
531
532 /* IPI vectors for APIC spurious and error interrupts */
533 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
534 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
535
536 /*
537 * Set the clock to HZ Hz, we already have a valid
538 * vector now:
539 */
540 setup_timer_hardware();
541
542 if (!acpi_ioapic)
543 setup_irq(2, &irq2);
544}
diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c
new file mode 100644
index 000000000000..d26fc063a760
--- /dev/null
+++ b/arch/x86/kernel/init_task_32.c
@@ -0,0 +1,46 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is THREAD_SIZE aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40
41/*
42 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
43 * no more per-task TSS's.
44 */
45DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
46
diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c
new file mode 100644
index 000000000000..4ff33d4f8551
--- /dev/null
+++ b/arch/x86/kernel/init_task_64.c
@@ -0,0 +1,54 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial thread structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
48
49/* Copies of the original ist values from the tss are only accessed during
50 * debugging, no special alignment required.
51 */
52DEFINE_PER_CPU(struct orig_ist, orig_ist);
53
54#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
new file mode 100644
index 000000000000..e2f4a1c68547
--- /dev/null
+++ b/arch/x86/kernel/io_apic_32.c
@@ -0,0 +1,2847 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/mc146818rtc.h>
29#include <linux/compiler.h>
30#include <linux/acpi.h>
31#include <linux/module.h>
32#include <linux/sysdev.h>
33#include <linux/pci.h>
34#include <linux/msi.h>
35#include <linux/htirq.h>
36#include <linux/freezer.h>
37#include <linux/kthread.h>
38
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/timer.h>
43#include <asm/i8259.h>
44#include <asm/nmi.h>
45#include <asm/msidef.h>
46#include <asm/hypertransport.h>
47
48#include <mach_apic.h>
49#include <mach_apicdef.h>
50
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where, if anywhere, the i8259 is connected in ExtINT mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock);
61
62int timer_over_8254 __initdata = 1;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75static int disable_timer_pin_1 __initdata;
76
77/*
78 * Rough estimate of how many shared IRQs there are; can
79 * be changed anytime.
80 */
81#define MAX_PLUS_SHARED_IRQS NR_IRQS
82#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
83
84/*
85 * This is performance-critical; we want to do it in O(1)
86 *
87 * the indexing order of this array favors 1:1 mappings
88 * between pins and IRQs.
89 */
90
91static struct irq_pin_list {
92 int apic, pin, next;
93} irq_2_pin[PIN_MAP_SIZE];
94
95struct io_apic {
96 unsigned int index;
97 unsigned int unused[3];
98 unsigned int data;
99};
100
101static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
102{
103 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
104 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
105}
106
107static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
108{
109 struct io_apic __iomem *io_apic = io_apic_base(apic);
110 writel(reg, &io_apic->index);
111 return readl(&io_apic->data);
112}
113
114static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
115{
116 struct io_apic __iomem *io_apic = io_apic_base(apic);
117 writel(reg, &io_apic->index);
118 writel(value, &io_apic->data);
119}
120
121/*
122 * Re-write a value: to be used for read-modify-write
123 * cycles where the read already set up the index register.
124 *
125 * Older SiS APICs require that we rewrite the index register
126 */
127static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
128{
129 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
130 if (sis_apic_bug)
131 writel(reg, &io_apic->index);
132 writel(value, &io_apic->data);
133}
134
135union entry_union {
136 struct { u32 w1, w2; };
137 struct IO_APIC_route_entry entry;
138};
139
140static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
141{
142 union entry_union eu;
143 unsigned long flags;
144 spin_lock_irqsave(&ioapic_lock, flags);
145 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
146 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148 return eu.entry;
149}
150
151/*
152 * When we write a new IO APIC routing entry, we need to write the high
153 * word first! If the mask bit in the low word is clear, we will enable
154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens.
156 */
157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
159{
160 union entry_union eu;
161 eu.entry = e;
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
171 spin_unlock_irqrestore(&ioapic_lock, flags);
172}
173
174/*
175 * When we mask an IO APIC routing entry, we need to write the low
176 * word first, in order to set the mask bit before we change the
177 * high bits!
178 */
179static void ioapic_mask_entry(int apic, int pin)
180{
181 unsigned long flags;
182 union entry_union eu = { .entry.mask = 1 };
183
184 spin_lock_irqsave(&ioapic_lock, flags);
185 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
186 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
192 * shared ISA-space IRQs, so we have to support them. We are super
193 * fast in the common case, and fast for shared ISA-space IRQs.
194 */
195static void add_pin_to_irq(unsigned int irq, int apic, int pin)
196{
197 static int first_free_entry = NR_IRQS;
198 struct irq_pin_list *entry = irq_2_pin + irq;
199
200 while (entry->next)
201 entry = irq_2_pin + entry->next;
202
203 if (entry->pin != -1) {
204 entry->next = first_free_entry;
205 entry = irq_2_pin + entry->next;
206 if (++first_free_entry >= PIN_MAP_SIZE)
207 panic("io_apic.c: whoops");
208 }
209 entry->apic = apic;
210 entry->pin = pin;
211}
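
/*
 * A note on the data structure used above: irq_2_pin[] is a statically
 * sized pool that doubles as a set of singly linked lists. Entries
 * 0..NR_IRQS-1 are the list heads (one per IRQ); when an IRQ is shared
 * by several IO-APIC pins, extra nodes are carved out of the tail of
 * the array (first_free_entry) and chained through the 'next' index.
 */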
212
213/*
214 * Reroute an IRQ to a different pin.
215 */
216static void __init replace_pin_at_irq(unsigned int irq,
217 int oldapic, int oldpin,
218 int newapic, int newpin)
219{
220 struct irq_pin_list *entry = irq_2_pin + irq;
221
222 while (1) {
223 if (entry->apic == oldapic && entry->pin == oldpin) {
224 entry->apic = newapic;
225 entry->pin = newpin;
226 }
227 if (!entry->next)
228 break;
229 entry = irq_2_pin + entry->next;
230 }
231}
232
233static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
234{
235 struct irq_pin_list *entry = irq_2_pin + irq;
236 unsigned int pin, reg;
237
238 for (;;) {
239 pin = entry->pin;
240 if (pin == -1)
241 break;
242 reg = io_apic_read(entry->apic, 0x10 + pin*2);
243 reg &= ~disable;
244 reg |= enable;
245 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
246 if (!entry->next)
247 break;
248 entry = irq_2_pin + entry->next;
249 }
250}
251
252/* mask = 1 */
253static void __mask_IO_APIC_irq (unsigned int irq)
254{
255 __modify_IO_APIC_irq(irq, 0x00010000, 0);
256}
257
258/* mask = 0 */
259static void __unmask_IO_APIC_irq (unsigned int irq)
260{
261 __modify_IO_APIC_irq(irq, 0, 0x00010000);
262}
263
264/* mask = 1, trigger = 0 */
265static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
266{
267 __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
268}
269
270/* mask = 0, trigger = 1 */
271static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
272{
273 __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
274}
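
/*
 * The magic constants above live in the low dword of an IO-APIC
 * redirection entry (register 0x10 + 2*pin): bit 16 (0x00010000) is
 * the mask bit and bit 15 (0x00008000) selects level (1) versus edge
 * (0) trigger mode, which is what the mask/unmask and edge/level
 * helpers flip.
 */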
275
276static void mask_IO_APIC_irq (unsigned int irq)
277{
278 unsigned long flags;
279
280 spin_lock_irqsave(&ioapic_lock, flags);
281 __mask_IO_APIC_irq(irq);
282 spin_unlock_irqrestore(&ioapic_lock, flags);
283}
284
285static void unmask_IO_APIC_irq (unsigned int irq)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&ioapic_lock, flags);
290 __unmask_IO_APIC_irq(irq);
291 spin_unlock_irqrestore(&ioapic_lock, flags);
292}
293
294static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
295{
296 struct IO_APIC_route_entry entry;
297
298 /* Check delivery_mode to be sure we're not clearing an SMI pin */
299 entry = ioapic_read_entry(apic, pin);
300 if (entry.delivery_mode == dest_SMI)
301 return;
302
303 /*
304 * Disable it in the IO-APIC irq-routing table:
305 */
306 ioapic_mask_entry(apic, pin);
307}
308
309static void clear_IO_APIC (void)
310{
311 int apic, pin;
312
313 for (apic = 0; apic < nr_ioapics; apic++)
314 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
315 clear_IO_APIC_pin(apic, pin);
316}
317
318#ifdef CONFIG_SMP
319static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
320{
321 unsigned long flags;
322 int pin;
323 struct irq_pin_list *entry = irq_2_pin + irq;
324 unsigned int apicid_value;
325 cpumask_t tmp;
326
327 cpus_and(tmp, cpumask, cpu_online_map);
328 if (cpus_empty(tmp))
329 tmp = TARGET_CPUS;
330
331 cpus_and(cpumask, tmp, CPU_MASK_ALL);
332
333 apicid_value = cpu_mask_to_apicid(cpumask);
334 /* Prepare to do the io_apic_write */
335 apicid_value = apicid_value << 24;
336 spin_lock_irqsave(&ioapic_lock, flags);
337 for (;;) {
338 pin = entry->pin;
339 if (pin == -1)
340 break;
341 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
342 if (!entry->next)
343 break;
344 entry = irq_2_pin + entry->next;
345 }
346 irq_desc[irq].affinity = cpumask;
347 spin_unlock_irqrestore(&ioapic_lock, flags);
348}
349
350#if defined(CONFIG_IRQBALANCE)
351# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */
355
356#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
358#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
359#define BALANCED_IRQ_MORE_DELTA (HZ/10)
360#define BALANCED_IRQ_LESS_DELTA (HZ)
361
362static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
363static int physical_balance __read_mostly;
364static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
365
366static struct irq_cpu_info {
367 unsigned long * last_irq;
368 unsigned long * irq_delta;
369 unsigned long irq;
370} irq_cpu_data[NR_CPUS];
371
372#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
373#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
374#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
375
376#define IDLE_ENOUGH(cpu,now) \
377 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
378
379#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
380
381#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
382
383static cpumask_t balance_irq_affinity[NR_IRQS] = {
384 [0 ... NR_IRQS-1] = CPU_MASK_ALL
385};
386
387void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
388{
389 balance_irq_affinity[irq] = mask;
390}
391
392static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
393 unsigned long now, int direction)
394{
395 int search_idle = 1;
396 int cpu = curr_cpu;
397
398 goto inside;
399
400 do {
401 if (unlikely(cpu == curr_cpu))
402 search_idle = 0;
403inside:
404 if (direction == 1) {
405 cpu++;
406 if (cpu >= NR_CPUS)
407 cpu = 0;
408 } else {
409 cpu--;
410 if (cpu == -1)
411 cpu = NR_CPUS-1;
412 }
413 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
414 (search_idle && !IDLE_ENOUGH(cpu,now)));
415
416 return cpu;
417}
418
419static inline void balance_irq(int cpu, int irq)
420{
421 unsigned long now = jiffies;
422 cpumask_t allowed_mask;
423 unsigned int new_cpu;
424
425 if (irqbalance_disabled)
426 return;
427
428 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
429 new_cpu = move(cpu, allowed_mask, now, 1);
430 if (cpu != new_cpu) {
431 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
432 }
433}
434
435static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
436{
437 int i, j;
438
439 for_each_online_cpu(i) {
440 for (j = 0; j < NR_IRQS; j++) {
441 if (!irq_desc[j].action)
442 continue;
443 /* Is it a significant load ? */
444 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
445 useful_load_threshold)
446 continue;
447 balance_irq(i, j);
448 }
449 }
450 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
451 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
452 return;
453}
454
455static void do_irq_balance(void)
456{
457 int i, j;
458 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
459 unsigned long move_this_load = 0;
460 int max_loaded = 0, min_loaded = 0;
461 int load;
462 unsigned long useful_load_threshold = balanced_irq_interval + 10;
463 int selected_irq;
464 int tmp_loaded, first_attempt = 1;
465 unsigned long tmp_cpu_irq;
466 unsigned long imbalance = 0;
467 cpumask_t allowed_mask, target_cpu_mask, tmp;
468
469 for_each_possible_cpu(i) {
470 int package_index;
471 CPU_IRQ(i) = 0;
472 if (!cpu_online(i))
473 continue;
474 package_index = CPU_TO_PACKAGEINDEX(i);
475 for (j = 0; j < NR_IRQS; j++) {
476 unsigned long value_now, delta;
477 /* Is this an active IRQ or balancing disabled ? */
478 if (!irq_desc[j].action || irq_balancing_disabled(j))
479 continue;
480 if ( package_index == i )
481 IRQ_DELTA(package_index,j) = 0;
482 /* Determine the total count per processor per IRQ */
483 value_now = (unsigned long) kstat_cpu(i).irqs[j];
484
485 /* Determine the activity per processor per IRQ */
486 delta = value_now - LAST_CPU_IRQ(i,j);
487
488 /* Update last_cpu_irq[][] for the next time */
489 LAST_CPU_IRQ(i,j) = value_now;
490
491 /* Ignore IRQs whose rate is less than the clock */
492 if (delta < useful_load_threshold)
493 continue;
494 /* update the load for the processor or package total */
495 IRQ_DELTA(package_index,j) += delta;
496
497 /* Keep track of the higher numbered sibling as well */
498 if (i != package_index)
499 CPU_IRQ(i) += delta;
500 /*
501 * We have sibling A and sibling B in the package
502 *
503 * cpu_irq[A] = load for cpu A + load for cpu B
504 * cpu_irq[B] = load for cpu B
505 */
506 CPU_IRQ(package_index) += delta;
507 }
508 }
509 /* Find the least loaded processor package */
510 for_each_online_cpu(i) {
511 if (i != CPU_TO_PACKAGEINDEX(i))
512 continue;
513 if (min_cpu_irq > CPU_IRQ(i)) {
514 min_cpu_irq = CPU_IRQ(i);
515 min_loaded = i;
516 }
517 }
518 max_cpu_irq = ULONG_MAX;
519
520tryanothercpu:
521 /* Look for heaviest loaded processor.
522 * We may come back to get the next heaviest loaded processor.
523 * Skip processors with trivial loads.
524 */
525 tmp_cpu_irq = 0;
526 tmp_loaded = -1;
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (max_cpu_irq <= CPU_IRQ(i))
531 continue;
532 if (tmp_cpu_irq < CPU_IRQ(i)) {
533 tmp_cpu_irq = CPU_IRQ(i);
534 tmp_loaded = i;
535 }
536 }
537
538 if (tmp_loaded == -1) {
539 /* In the case of a small number of heavy interrupt sources,
540 * some of the cpus end up loaded too much. We use Ingo's original
541 * approach and rotate them around.
542 */
543 if (!first_attempt && imbalance >= useful_load_threshold) {
544 rotate_irqs_among_cpus(useful_load_threshold);
545 return;
546 }
547 goto not_worth_the_effort;
548 }
549
550 first_attempt = 0; /* heaviest search */
551 max_cpu_irq = tmp_cpu_irq; /* load */
552 max_loaded = tmp_loaded; /* processor */
553 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
554
555 /* If the imbalance is less than roughly 10% of the max load, we
556 * hit diminishing returns - quit.
557 */
558 if (imbalance < (max_cpu_irq >> 3))
559 goto not_worth_the_effort;
560
561tryanotherirq:
562 /* if we select an IRQ to move that can't go where we want, then
563 * see if there is another one to try.
564 */
565 move_this_load = 0;
566 selected_irq = -1;
567 for (j = 0; j < NR_IRQS; j++) {
568 /* Is this an active IRQ? */
569 if (!irq_desc[j].action)
570 continue;
571 if (imbalance <= IRQ_DELTA(max_loaded,j))
572 continue;
573 /* Try to find the IRQ that is closest to the imbalance
574 * without going over.
575 */
576 if (move_this_load < IRQ_DELTA(max_loaded,j)) {
577 move_this_load = IRQ_DELTA(max_loaded,j);
578 selected_irq = j;
579 }
580 }
581 if (selected_irq == -1) {
582 goto tryanothercpu;
583 }
584
585 imbalance = move_this_load;
586
587 /* For the physical_balance case, we accumulated both load
588 * values in one of the siblings' cpu_irq[],
589 * to use the same code for physical and logical processors
590 * as much as possible.
591 *
592 * NOTE: the cpu_irq[] array holds the sum of the load for
593 * sibling A and sibling B in the slot for the lowest numbered
594 * sibling (A), _AND_ the load for sibling B in the slot for
595 * the higher numbered sibling.
596 *
597 * We seek the least loaded sibling by making the comparison
598 * (A+B)/2 vs B
599 */
600 load = CPU_IRQ(min_loaded) >> 1;
601 for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
602 if (load > CPU_IRQ(j)) {
603 /* This won't change cpu_sibling_map[min_loaded] */
604 load = CPU_IRQ(j);
605 min_loaded = j;
606 }
607 }
608
609 cpus_and(allowed_mask,
610 cpu_online_map,
611 balance_irq_affinity[selected_irq]);
612 target_cpu_mask = cpumask_of_cpu(min_loaded);
613 cpus_and(tmp, target_cpu_mask, allowed_mask);
614
615 if (!cpus_empty(tmp)) {
616 /* mark for change destination */
617 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
618
619 /* Since we made a change, come back sooner to
620 * check for more variation.
621 */
622 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
623 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
624 return;
625 }
626 goto tryanotherirq;
627
628not_worth_the_effort:
629 /*
630 * if we did not find an IRQ to move, then adjust the time interval
631 * upward
632 */
633 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
634 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
635 return;
636}
637
638static int balanced_irq(void *unused)
639{
640 int i;
641 unsigned long prev_balance_time = jiffies;
642 long time_remaining = balanced_irq_interval;
643
644 /* push everything to CPU 0 to give us a starting point. */
645 for (i = 0 ; i < NR_IRQS ; i++) {
646 irq_desc[i].pending_mask = cpumask_of_cpu(0);
647 set_pending_irq(i, cpumask_of_cpu(0));
648 }
649
650 set_freezable();
651 for ( ; ; ) {
652 time_remaining = schedule_timeout_interruptible(time_remaining);
653 try_to_freeze();
654 if (time_after(jiffies,
655 prev_balance_time+balanced_irq_interval)) {
656 preempt_disable();
657 do_irq_balance();
658 prev_balance_time = jiffies;
659 time_remaining = balanced_irq_interval;
660 preempt_enable();
661 }
662 }
663 return 0;
664}
665
666static int __init balanced_irq_init(void)
667{
668 int i;
669 struct cpuinfo_x86 *c;
670 cpumask_t tmp;
671
672 cpus_shift_right(tmp, cpu_online_map, 2);
673 c = &boot_cpu_data;
674 /* When not overwritten by the command line ask subarchitecture. */
675 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
676 irqbalance_disabled = NO_BALANCE_IRQ;
677 if (irqbalance_disabled)
678 return 0;
679
680 /* disable irqbalance completely if there is only one processor online */
681 if (num_online_cpus() < 2) {
682 irqbalance_disabled = 1;
683 return 0;
684 }
685 /*
686 * Enable physical balance only if more than 1 physical processor
687 * is present
688 */
689 if (smp_num_siblings > 1 && !cpus_empty(tmp))
690 physical_balance = 1;
691
692 for_each_online_cpu(i) {
693 irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
694 irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
695 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
696 printk(KERN_ERR "balanced_irq_init: out of memory");
697 goto failed;
698 }
699 memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
700 memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
701 }
702
703 printk(KERN_INFO "Starting balanced_irq\n");
704 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
705 return 0;
706 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
707failed:
708 for_each_possible_cpu(i) {
709 kfree(irq_cpu_data[i].irq_delta);
710 irq_cpu_data[i].irq_delta = NULL;
711 kfree(irq_cpu_data[i].last_irq);
712 irq_cpu_data[i].last_irq = NULL;
713 }
714 return 0;
715}
716
717int __devinit irqbalance_disable(char *str)
718{
719 irqbalance_disabled = 1;
720 return 1;
721}
722
723__setup("noirqbalance", irqbalance_disable);
724
725late_initcall(balanced_irq_init);
726#endif /* CONFIG_IRQBALANCE */
727#endif /* CONFIG_SMP */
728
729#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector)
731{
732 unsigned int cfg;
733
734 /*
735 * Wait for idle.
736 */
737 apic_wait_icr_idle();
738 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
739 /*
740 * Send the IPI. The write to APIC_ICR fires this off.
741 */
742 apic_write_around(APIC_ICR, cfg);
743}
744#endif /* !CONFIG_SMP */
745
746
747/*
748 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
749 * specific CPU-side IRQs.
750 */
751
752#define MAX_PIRQS 8
753static int pirq_entries [MAX_PIRQS];
754static int pirqs_enabled;
755int skip_ioapic_setup;
756
757static int __init ioapic_pirq_setup(char *str)
758{
759 int i, max;
760 int ints[MAX_PIRQS+1];
761
762 get_options(str, ARRAY_SIZE(ints), ints);
763
764 for (i = 0; i < MAX_PIRQS; i++)
765 pirq_entries[i] = -1;
766
767 pirqs_enabled = 1;
768 apic_printk(APIC_VERBOSE, KERN_INFO
769 "PIRQ redirection, working around broken MP-BIOS.\n");
770 max = MAX_PIRQS;
771 if (ints[0] < MAX_PIRQS)
772 max = ints[0];
773
774 for (i = 0; i < max; i++) {
775 apic_printk(APIC_VERBOSE, KERN_DEBUG
776 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
777 /*
778 * PIRQs are mapped upside down, usually.
779 */
780 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
781 }
782 return 1;
783}
784
785__setup("pirq=", ioapic_pirq_setup);
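/*
 * Usage sketch (illustrative example): booting with e.g. "pirq=15,11,10"
 * feeds three IRQ numbers to the parser above.  The loop stores them in
 * reverse slot order (pirq_entries[MAX_PIRQS-i-1]) because PIRQ lines are
 * usually numbered upside down; pin_2_irq() below consults this table for
 * IO-APIC pins 16-23.
 */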
786
787/*
788 * Find the IRQ entry number of a certain pin.
789 */
790static int find_irq_entry(int apic, int pin, int type)
791{
792 int i;
793
794 for (i = 0; i < mp_irq_entries; i++)
795 if (mp_irqs[i].mpc_irqtype == type &&
796 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
797 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
798 mp_irqs[i].mpc_dstirq == pin)
799 return i;
800
801 return -1;
802}
803
804/*
805 * Find the pin to which IRQ[irq] (ISA) is connected
806 */
807static int __init find_isa_irq_pin(int irq, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++) {
812 int lbus = mp_irqs[i].mpc_srcbus;
813
814 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
815 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
816 mp_bus_id_to_type[lbus] == MP_BUS_MCA
817 ) &&
818 (mp_irqs[i].mpc_irqtype == type) &&
819 (mp_irqs[i].mpc_srcbusirq == irq))
820
821 return mp_irqs[i].mpc_dstirq;
822 }
823 return -1;
824}
825
826static int __init find_isa_irq_apic(int irq, int type)
827{
828 int i;
829
830 for (i = 0; i < mp_irq_entries; i++) {
831 int lbus = mp_irqs[i].mpc_srcbus;
832
833 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
834 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
835 mp_bus_id_to_type[lbus] == MP_BUS_MCA
836 ) &&
837 (mp_irqs[i].mpc_irqtype == type) &&
838 (mp_irqs[i].mpc_srcbusirq == irq))
839 break;
840 }
841 if (i < mp_irq_entries) {
842 int apic;
843 for(apic = 0; apic < nr_ioapics; apic++) {
844 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
845 return apic;
846 }
847 }
848
849 return -1;
850}
851
852/*
853 * Find a specific PCI IRQ entry.
854 * Not an __init, possibly needed by modules
855 */
856static int pin_2_irq(int idx, int apic, int pin);
857
858int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
859{
860 int apic, i, best_guess = -1;
861
862 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
863 "slot:%d, pin:%d.\n", bus, slot, pin);
864 if (mp_bus_id_to_pci_bus[bus] == -1) {
865 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
866 return -1;
867 }
868 for (i = 0; i < mp_irq_entries; i++) {
869 int lbus = mp_irqs[i].mpc_srcbus;
870
871 for (apic = 0; apic < nr_ioapics; apic++)
872 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
873 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
874 break;
875
876 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
877 !mp_irqs[i].mpc_irqtype &&
878 (bus == lbus) &&
879 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
880 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
881
882 if (!(apic || IO_APIC_IRQ(irq)))
883 continue;
884
885 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
886 return irq;
887 /*
888 * Use the first all-but-pin matching entry as a
889 * best-guess fuzzy result for broken mptables.
890 */
891 if (best_guess < 0)
892 best_guess = irq;
893 }
894 }
895 return best_guess;
896}
897EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
898
899/*
900 * This function is currently only a helper for the i386 SMP boot process, where
901 * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
902 * so the mask in all cases should simply be TARGET_CPUS.
903 */
904#ifdef CONFIG_SMP
905void __init setup_ioapic_dest(void)
906{
907 int pin, ioapic, irq, irq_entry;
908
909 if (skip_ioapic_setup == 1)
910 return;
911
912 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
913 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
914 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
915 if (irq_entry == -1)
916 continue;
917 irq = pin_2_irq(irq_entry, ioapic, pin);
918 set_ioapic_affinity_irq(irq, TARGET_CPUS);
919 }
920
921 }
922}
923#endif
924
925/*
926 * EISA Edge/Level control register, ELCR
927 */
928static int EISA_ELCR(unsigned int irq)
929{
930 if (irq < 16) {
931 unsigned int port = 0x4d0 + (irq >> 3);
932 return (inb(port) >> (irq & 7)) & 1;
933 }
934 apic_printk(APIC_VERBOSE, KERN_INFO
935 "Broken MPtable reports ISA irq %d\n", irq);
936 return 0;
937}
938
939/* EISA interrupts are always polarity zero and can be edge or level
940 * trigger depending on the ELCR value. If an interrupt is listed as
941 * EISA conforming in the MP table, that means its trigger type must
942 * be read in from the ELCR */
943
944#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
945#define default_EISA_polarity(idx) (0)
946
947/* ISA interrupts are always polarity zero edge triggered,
948 * when listed as conforming in the MP table. */
949
950#define default_ISA_trigger(idx) (0)
951#define default_ISA_polarity(idx) (0)
952
953/* PCI interrupts are always polarity one level triggered,
954 * when listed as conforming in the MP table. */
955
956#define default_PCI_trigger(idx) (1)
957#define default_PCI_polarity(idx) (1)
958
959/* MCA interrupts are always polarity zero level triggered,
960 * when listed as conforming in the MP table. */
961
962#define default_MCA_trigger(idx) (1)
963#define default_MCA_polarity(idx) (0)
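/*
 * Summary of the conforming defaults above (trigger: 0 = edge, 1 = level;
 * polarity: 0 = active high, 1 = active low):
 *
 *	ISA	edge		active high
 *	EISA	from ELCR	active high
 *	PCI	level		active low
 *	MCA	level		active high
 */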
964
965static int __init MPBIOS_polarity(int idx)
966{
967 int bus = mp_irqs[idx].mpc_srcbus;
968 int polarity;
969
970 /*
971 * Determine IRQ line polarity (high active or low active):
972 */
973 switch (mp_irqs[idx].mpc_irqflag & 3)
974 {
975 case 0: /* conforms, ie. bus-type dependent polarity */
976 {
977 switch (mp_bus_id_to_type[bus])
978 {
979 case MP_BUS_ISA: /* ISA pin */
980 {
981 polarity = default_ISA_polarity(idx);
982 break;
983 }
984 case MP_BUS_EISA: /* EISA pin */
985 {
986 polarity = default_EISA_polarity(idx);
987 break;
988 }
989 case MP_BUS_PCI: /* PCI pin */
990 {
991 polarity = default_PCI_polarity(idx);
992 break;
993 }
994 case MP_BUS_MCA: /* MCA pin */
995 {
996 polarity = default_MCA_polarity(idx);
997 break;
998 }
999 default:
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 1: /* high active */
1009 {
1010 polarity = 0;
1011 break;
1012 }
1013 case 2: /* reserved */
1014 {
1015 printk(KERN_WARNING "broken BIOS!!\n");
1016 polarity = 1;
1017 break;
1018 }
1019 case 3: /* low active */
1020 {
1021 polarity = 1;
1022 break;
1023 }
1024 default: /* invalid */
1025 {
1026 printk(KERN_WARNING "broken BIOS!!\n");
1027 polarity = 1;
1028 break;
1029 }
1030 }
1031 return polarity;
1032}
1033
1034static int MPBIOS_trigger(int idx)
1035{
1036 int bus = mp_irqs[idx].mpc_srcbus;
1037 int trigger;
1038
1039 /*
1040 * Determine IRQ trigger mode (edge or level sensitive):
1041 */
1042 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
1043 {
1044 case 0: /* conforms, ie. bus-type dependent */
1045 {
1046 switch (mp_bus_id_to_type[bus])
1047 {
1048 case MP_BUS_ISA: /* ISA pin */
1049 {
1050 trigger = default_ISA_trigger(idx);
1051 break;
1052 }
1053 case MP_BUS_EISA: /* EISA pin */
1054 {
1055 trigger = default_EISA_trigger(idx);
1056 break;
1057 }
1058 case MP_BUS_PCI: /* PCI pin */
1059 {
1060 trigger = default_PCI_trigger(idx);
1061 break;
1062 }
1063 case MP_BUS_MCA: /* MCA pin */
1064 {
1065 trigger = default_MCA_trigger(idx);
1066 break;
1067 }
1068 default:
1069 {
1070 printk(KERN_WARNING "broken BIOS!!\n");
1071 trigger = 1;
1072 break;
1073 }
1074 }
1075 break;
1076 }
1077 case 1: /* edge */
1078 {
1079 trigger = 0;
1080 break;
1081 }
1082 case 2: /* reserved */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 1;
1086 break;
1087 }
1088 case 3: /* level */
1089 {
1090 trigger = 1;
1091 break;
1092 }
1093 default: /* invalid */
1094 {
1095 printk(KERN_WARNING "broken BIOS!!\n");
1096 trigger = 0;
1097 break;
1098 }
1099 }
1100 return trigger;
1101}
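/*
 * The switches above decode the low four bits of the MP table's irqflag
 * word: bits 1:0 give the polarity and bits 3:2 the trigger mode, where 0
 * in either field means "conforms to the bus specification" and falls back
 * to the per-bus defaults defined earlier.
 */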
1102
1103static inline int irq_polarity(int idx)
1104{
1105 return MPBIOS_polarity(idx);
1106}
1107
1108static inline int irq_trigger(int idx)
1109{
1110 return MPBIOS_trigger(idx);
1111}
1112
1113static int pin_2_irq(int idx, int apic, int pin)
1114{
1115 int irq, i;
1116 int bus = mp_irqs[idx].mpc_srcbus;
1117
1118 /*
1119	 * Debugging check: we are in big trouble if this message pops up!
1120 */
1121 if (mp_irqs[idx].mpc_dstirq != pin)
1122 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1123
1124 switch (mp_bus_id_to_type[bus])
1125 {
1126 case MP_BUS_ISA: /* ISA pin */
1127 case MP_BUS_EISA:
1128 case MP_BUS_MCA:
1129 {
1130 irq = mp_irqs[idx].mpc_srcbusirq;
1131 break;
1132 }
1133 case MP_BUS_PCI: /* PCI pin */
1134 {
1135 /*
1136 * PCI IRQs are mapped in order
1137 */
1138 i = irq = 0;
1139 while (i < apic)
1140 irq += nr_ioapic_registers[i++];
1141 irq += pin;
1142
1143 /*
1144 * For MPS mode, so far only needed by ES7000 platform
1145 */
1146 if (ioapic_renumber_irq)
1147 irq = ioapic_renumber_irq(apic, irq);
1148
1149 break;
1150 }
1151 default:
1152 {
1153 printk(KERN_ERR "unknown bus type %d.\n",bus);
1154 irq = 0;
1155 break;
1156 }
1157 }
1158
1159 /*
1160 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1161 */
1162 if ((pin >= 16) && (pin <= 23)) {
1163 if (pirq_entries[pin-16] != -1) {
1164 if (!pirq_entries[pin-16]) {
1165 apic_printk(APIC_VERBOSE, KERN_DEBUG
1166 "disabling PIRQ%d\n", pin-16);
1167 } else {
1168 irq = pirq_entries[pin-16];
1169 apic_printk(APIC_VERBOSE, KERN_DEBUG
1170 "using PIRQ%d -> IRQ %d\n",
1171 pin-16, irq);
1172 }
1173 }
1174 }
1175 return irq;
1176}
1177
1178static inline int IO_APIC_irq_trigger(int irq)
1179{
1180 int apic, idx, pin;
1181
1182 for (apic = 0; apic < nr_ioapics; apic++) {
1183 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1184 idx = find_irq_entry(apic,pin,mp_INT);
1185 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
1186 return irq_trigger(idx);
1187 }
1188 }
1189 /*
1190 * nonexistent IRQs are edge default
1191 */
1192 return 0;
1193}
1194
1195/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1196static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1197
1198static int __assign_irq_vector(int irq)
1199{
1200 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1201 int vector, offset, i;
1202
1203 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1204
1205 if (irq_vector[irq] > 0)
1206 return irq_vector[irq];
1207
1208 vector = current_vector;
1209 offset = current_offset;
1210next:
1211 vector += 8;
1212 if (vector >= FIRST_SYSTEM_VECTOR) {
1213 offset = (offset + 1) % 8;
1214 vector = FIRST_DEVICE_VECTOR + offset;
1215 }
1216 if (vector == current_vector)
1217 return -ENOSPC;
1218 if (vector == SYSCALL_VECTOR)
1219 goto next;
1220 for (i = 0; i < NR_IRQ_VECTORS; i++)
1221 if (irq_vector[i] == vector)
1222 goto next;
1223
1224 current_vector = vector;
1225 current_offset = offset;
1226 irq_vector[irq] = vector;
1227
1228 return vector;
1229}
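/*
 * Illustrative walk-through (exact values come from the platform headers):
 * IRQ0 is statically given FIRST_DEVICE_VECTOR by the irq_vector[]
 * initializer above; each later allocation steps 8 past the previous one,
 * so device vectors get spread across local-APIC priority levels
 * (level = vector >> 4) instead of piling up in one.  Once the walk hits
 * FIRST_SYSTEM_VECTOR it wraps back to FIRST_DEVICE_VECTOR with the offset
 * bumped by one, and SYSCALL_VECTOR (int 0x80) is always skipped.
 */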
1230
1231static int assign_irq_vector(int irq)
1232{
1233 unsigned long flags;
1234 int vector;
1235
1236 spin_lock_irqsave(&vector_lock, flags);
1237 vector = __assign_irq_vector(irq);
1238 spin_unlock_irqrestore(&vector_lock, flags);
1239
1240 return vector;
1241}
1242static struct irq_chip ioapic_chip;
1243
1244#define IOAPIC_AUTO -1
1245#define IOAPIC_EDGE 0
1246#define IOAPIC_LEVEL 1
1247
1248static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1249{
1250 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1251 trigger == IOAPIC_LEVEL) {
1252 irq_desc[irq].status |= IRQ_LEVEL;
1253 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1254 handle_fasteoi_irq, "fasteoi");
1255 } else {
1256 irq_desc[irq].status &= ~IRQ_LEVEL;
1257 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1258 handle_edge_irq, "edge");
1259 }
1260 set_intr_gate(vector, interrupt[irq]);
1261}
1262
1263static void __init setup_IO_APIC_irqs(void)
1264{
1265 struct IO_APIC_route_entry entry;
1266 int apic, pin, idx, irq, first_notcon = 1, vector;
1267 unsigned long flags;
1268
1269 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1270
1271 for (apic = 0; apic < nr_ioapics; apic++) {
1272 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1273
1274 /*
1275 * add it to the IO-APIC irq-routing table:
1276 */
1277 memset(&entry,0,sizeof(entry));
1278
1279 entry.delivery_mode = INT_DELIVERY_MODE;
1280 entry.dest_mode = INT_DEST_MODE;
1281 entry.mask = 0; /* enable IRQ */
1282 entry.dest.logical.logical_dest =
1283 cpu_mask_to_apicid(TARGET_CPUS);
1284
1285 idx = find_irq_entry(apic,pin,mp_INT);
1286 if (idx == -1) {
1287 if (first_notcon) {
1288 apic_printk(APIC_VERBOSE, KERN_DEBUG
1289 " IO-APIC (apicid-pin) %d-%d",
1290 mp_ioapics[apic].mpc_apicid,
1291 pin);
1292 first_notcon = 0;
1293 } else
1294 apic_printk(APIC_VERBOSE, ", %d-%d",
1295 mp_ioapics[apic].mpc_apicid, pin);
1296 continue;
1297 }
1298
1299 entry.trigger = irq_trigger(idx);
1300 entry.polarity = irq_polarity(idx);
1301
1302 if (irq_trigger(idx)) {
1303 entry.trigger = 1;
1304 entry.mask = 1;
1305 }
1306
1307 irq = pin_2_irq(idx, apic, pin);
1308 /*
1309 * skip adding the timer int on secondary nodes, which causes
1310 * a small but painful rift in the time-space continuum
1311 */
1312 if (multi_timer_check(apic, irq))
1313 continue;
1314 else
1315 add_pin_to_irq(irq, apic, pin);
1316
1317 if (!apic && !IO_APIC_IRQ(irq))
1318 continue;
1319
1320 if (IO_APIC_IRQ(irq)) {
1321 vector = assign_irq_vector(irq);
1322 entry.vector = vector;
1323 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1324
1325 if (!apic && (irq < 16))
1326 disable_8259A_irq(irq);
1327 }
1328 spin_lock_irqsave(&ioapic_lock, flags);
1329 __ioapic_write_entry(apic, pin, entry);
1330 spin_unlock_irqrestore(&ioapic_lock, flags);
1331 }
1332 }
1333
1334 if (!first_notcon)
1335 apic_printk(APIC_VERBOSE, " not connected.\n");
1336}
1337
1338/*
1339 * Set up the 8259A-master output pin:
1340 */
1341static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
1342{
1343 struct IO_APIC_route_entry entry;
1344
1345 memset(&entry,0,sizeof(entry));
1346
1347 disable_8259A_irq(0);
1348
1349 /* mask LVT0 */
1350 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1351
1352 /*
1353 * We use logical delivery to get the timer IRQ
1354 * to the first CPU.
1355 */
1356 entry.dest_mode = INT_DEST_MODE;
1357 entry.mask = 0; /* unmask IRQ now */
1358 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1359 entry.delivery_mode = INT_DELIVERY_MODE;
1360 entry.polarity = 0;
1361 entry.trigger = 0;
1362 entry.vector = vector;
1363
1364 /*
1365 * The timer IRQ doesn't have to know that behind the
1366 * scene we have a 8259A-master in AEOI mode ...
1367 */
1368 irq_desc[0].chip = &ioapic_chip;
1369 set_irq_handler(0, handle_edge_irq);
1370
1371 /*
1372 * Add it to the IO-APIC irq-routing table:
1373 */
1374 ioapic_write_entry(apic, pin, entry);
1375
1376 enable_8259A_irq(0);
1377}
1378
1379void __init print_IO_APIC(void)
1380{
1381 int apic, i;
1382 union IO_APIC_reg_00 reg_00;
1383 union IO_APIC_reg_01 reg_01;
1384 union IO_APIC_reg_02 reg_02;
1385 union IO_APIC_reg_03 reg_03;
1386 unsigned long flags;
1387
1388 if (apic_verbosity == APIC_QUIET)
1389 return;
1390
1391 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1392 for (i = 0; i < nr_ioapics; i++)
1393 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1394 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
1395
1396 /*
1397 * We are a bit conservative about what we expect. We have to
1398 * know about every hardware change ASAP.
1399 */
1400 printk(KERN_INFO "testing the IO APIC.......................\n");
1401
1402 for (apic = 0; apic < nr_ioapics; apic++) {
1403
1404 spin_lock_irqsave(&ioapic_lock, flags);
1405 reg_00.raw = io_apic_read(apic, 0);
1406 reg_01.raw = io_apic_read(apic, 1);
1407 if (reg_01.bits.version >= 0x10)
1408 reg_02.raw = io_apic_read(apic, 2);
1409 if (reg_01.bits.version >= 0x20)
1410 reg_03.raw = io_apic_read(apic, 3);
1411 spin_unlock_irqrestore(&ioapic_lock, flags);
1412
1413 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1414 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1415 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1416 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1417 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1418
1419 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1420 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1421
1422 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1423 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1424
1425 /*
1426		 * Some Intel chipsets with an IO APIC version of 0x1? don't have reg_02;
1427		 * reading it simply returns the previously read register value, so
1428		 * ignore it if reg_02 == reg_01.
1429 */
1430 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1431 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1432 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1433 }
1434
1435 /*
1436		 * Some Intel chipsets with an IO APIC version of 0x2? don't have reg_02
1437		 * or reg_03; reading them simply returns the previously read register
1438		 * value, so ignore reg_03 if it equals reg_02 or reg_01.
1439 */
1440 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1441 reg_03.raw != reg_01.raw) {
1442 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1443 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1444 }
1445
1446 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1447
1448 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1449 " Stat Dest Deli Vect: \n");
1450
1451 for (i = 0; i <= reg_01.bits.entries; i++) {
1452 struct IO_APIC_route_entry entry;
1453
1454 entry = ioapic_read_entry(apic, i);
1455
1456 printk(KERN_DEBUG " %02x %03X %02X ",
1457 i,
1458 entry.dest.logical.logical_dest,
1459 entry.dest.physical.physical_dest
1460 );
1461
1462 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1463 entry.mask,
1464 entry.trigger,
1465 entry.irr,
1466 entry.polarity,
1467 entry.delivery_status,
1468 entry.dest_mode,
1469 entry.delivery_mode,
1470 entry.vector
1471 );
1472 }
1473 }
1474 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1475 for (i = 0; i < NR_IRQS; i++) {
1476 struct irq_pin_list *entry = irq_2_pin + i;
1477 if (entry->pin < 0)
1478 continue;
1479 printk(KERN_DEBUG "IRQ%d ", i);
1480 for (;;) {
1481 printk("-> %d:%d", entry->apic, entry->pin);
1482 if (!entry->next)
1483 break;
1484 entry = irq_2_pin + entry->next;
1485 }
1486 printk("\n");
1487 }
1488
1489 printk(KERN_INFO ".................................... done.\n");
1490
1491 return;
1492}
1493
1494#if 0
1495
1496static void print_APIC_bitfield (int base)
1497{
1498 unsigned int v;
1499 int i, j;
1500
1501 if (apic_verbosity == APIC_QUIET)
1502 return;
1503
1504 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1505 for (i = 0; i < 8; i++) {
1506 v = apic_read(base + i*0x10);
1507 for (j = 0; j < 32; j++) {
1508 if (v & (1<<j))
1509 printk("1");
1510 else
1511 printk("0");
1512 }
1513 printk("\n");
1514 }
1515}
1516
1517void /*__init*/ print_local_APIC(void * dummy)
1518{
1519 unsigned int v, ver, maxlvt;
1520
1521 if (apic_verbosity == APIC_QUIET)
1522 return;
1523
1524 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1525 smp_processor_id(), hard_smp_processor_id());
1526 v = apic_read(APIC_ID);
1527 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1528 v = apic_read(APIC_LVR);
1529 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1530 ver = GET_APIC_VERSION(v);
1531 maxlvt = lapic_get_maxlvt();
1532
1533 v = apic_read(APIC_TASKPRI);
1534 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1535
1536 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1537 v = apic_read(APIC_ARBPRI);
1538 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1539 v & APIC_ARBPRI_MASK);
1540 v = apic_read(APIC_PROCPRI);
1541 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1542 }
1543
1544 v = apic_read(APIC_EOI);
1545 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1546 v = apic_read(APIC_RRR);
1547 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1548 v = apic_read(APIC_LDR);
1549 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1550 v = apic_read(APIC_DFR);
1551 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1552 v = apic_read(APIC_SPIV);
1553 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1554
1555 printk(KERN_DEBUG "... APIC ISR field:\n");
1556 print_APIC_bitfield(APIC_ISR);
1557 printk(KERN_DEBUG "... APIC TMR field:\n");
1558 print_APIC_bitfield(APIC_TMR);
1559 printk(KERN_DEBUG "... APIC IRR field:\n");
1560 print_APIC_bitfield(APIC_IRR);
1561
1562 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1563 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1564 apic_write(APIC_ESR, 0);
1565 v = apic_read(APIC_ESR);
1566 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1567 }
1568
1569 v = apic_read(APIC_ICR);
1570 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1571 v = apic_read(APIC_ICR2);
1572 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1573
1574 v = apic_read(APIC_LVTT);
1575 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1576
1577 if (maxlvt > 3) { /* PC is LVT#4. */
1578 v = apic_read(APIC_LVTPC);
1579 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1580 }
1581 v = apic_read(APIC_LVT0);
1582 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1583 v = apic_read(APIC_LVT1);
1584 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1585
1586 if (maxlvt > 2) { /* ERR is LVT#3. */
1587 v = apic_read(APIC_LVTERR);
1588 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1589 }
1590
1591 v = apic_read(APIC_TMICT);
1592 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1593 v = apic_read(APIC_TMCCT);
1594 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1595 v = apic_read(APIC_TDCR);
1596 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1597 printk("\n");
1598}
1599
1600void print_all_local_APICs (void)
1601{
1602 on_each_cpu(print_local_APIC, NULL, 1, 1);
1603}
1604
1605void /*__init*/ print_PIC(void)
1606{
1607 unsigned int v;
1608 unsigned long flags;
1609
1610 if (apic_verbosity == APIC_QUIET)
1611 return;
1612
1613 printk(KERN_DEBUG "\nprinting PIC contents\n");
1614
1615 spin_lock_irqsave(&i8259A_lock, flags);
1616
1617 v = inb(0xa1) << 8 | inb(0x21);
1618 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1619
1620 v = inb(0xa0) << 8 | inb(0x20);
1621 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1622
1623 outb(0x0b,0xa0);
1624 outb(0x0b,0x20);
1625 v = inb(0xa0) << 8 | inb(0x20);
1626 outb(0x0a,0xa0);
1627 outb(0x0a,0x20);
1628
1629 spin_unlock_irqrestore(&i8259A_lock, flags);
1630
1631 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1632
1633 v = inb(0x4d1) << 8 | inb(0x4d0);
1634 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1635}
1636
1637#endif /* 0 */
1638
1639static void __init enable_IO_APIC(void)
1640{
1641 union IO_APIC_reg_01 reg_01;
1642 int i8259_apic, i8259_pin;
1643 int i, apic;
1644 unsigned long flags;
1645
1646 for (i = 0; i < PIN_MAP_SIZE; i++) {
1647 irq_2_pin[i].pin = -1;
1648 irq_2_pin[i].next = 0;
1649 }
1650 if (!pirqs_enabled)
1651 for (i = 0; i < MAX_PIRQS; i++)
1652 pirq_entries[i] = -1;
1653
1654 /*
1655 * The number of IO-APIC IRQ registers (== #pins):
1656 */
1657 for (apic = 0; apic < nr_ioapics; apic++) {
1658 spin_lock_irqsave(&ioapic_lock, flags);
1659 reg_01.raw = io_apic_read(apic, 1);
1660 spin_unlock_irqrestore(&ioapic_lock, flags);
1661 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1662 }
1663 for(apic = 0; apic < nr_ioapics; apic++) {
1664 int pin;
1665 /* See if any of the pins is in ExtINT mode */
1666 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1667 struct IO_APIC_route_entry entry;
1668 entry = ioapic_read_entry(apic, pin);
1669
1670
1671 /* If the interrupt line is enabled and in ExtInt mode
1672 * I have found the pin where the i8259 is connected.
1673 */
1674 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1675 ioapic_i8259.apic = apic;
1676 ioapic_i8259.pin = pin;
1677 goto found_i8259;
1678 }
1679 }
1680 }
1681 found_i8259:
1682	/* Look to see if the MP table has reported the ExtINT */
1683	/* If we could not find the appropriate pin by looking at the ioapic,
1684	 * the i8259 is probably not connected to the ioapic, but give the
1685	 * mptable a chance anyway.
1686 */
1687 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1688 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1689 /* Trust the MP table if nothing is setup in the hardware */
1690 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1691 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1692 ioapic_i8259.pin = i8259_pin;
1693 ioapic_i8259.apic = i8259_apic;
1694 }
1695 /* Complain if the MP table and the hardware disagree */
1696 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1697 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1698 {
1699 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1700 }
1701
1702 /*
1703 * Do not trust the IO-APIC being empty at bootup
1704 */
1705 clear_IO_APIC();
1706}
1707
1708/*
1709 * Not an __init, needed by the reboot code
1710 */
1711void disable_IO_APIC(void)
1712{
1713 /*
1714 * Clear the IO-APIC before rebooting:
1715 */
1716 clear_IO_APIC();
1717
1718 /*
1719 * If the i8259 is routed through an IOAPIC
1720 * Put that IOAPIC in virtual wire mode
1721 * so legacy interrupts can be delivered.
1722 */
1723 if (ioapic_i8259.pin != -1) {
1724 struct IO_APIC_route_entry entry;
1725
1726 memset(&entry, 0, sizeof(entry));
1727 entry.mask = 0; /* Enabled */
1728 entry.trigger = 0; /* Edge */
1729 entry.irr = 0;
1730 entry.polarity = 0; /* High */
1731 entry.delivery_status = 0;
1732 entry.dest_mode = 0; /* Physical */
1733 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1734 entry.vector = 0;
1735 entry.dest.physical.physical_dest =
1736 GET_APIC_ID(apic_read(APIC_ID));
1737
1738 /*
1739 * Add it to the IO-APIC irq-routing table:
1740 */
1741 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1742 }
1743 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1744}
1745
1746/*
1747 * function to set the IO-APIC physical IDs based on the
1748 * values stored in the MPC table.
1749 *
1750 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1751 */
1752
1753#ifndef CONFIG_X86_NUMAQ
1754static void __init setup_ioapic_ids_from_mpc(void)
1755{
1756 union IO_APIC_reg_00 reg_00;
1757 physid_mask_t phys_id_present_map;
1758 int apic;
1759 int i;
1760 unsigned char old_id;
1761 unsigned long flags;
1762
1763 /*
1764 * Don't check I/O APIC IDs for xAPIC systems. They have
1765 * no meaning without the serial APIC bus.
1766 */
1767 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1768 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1769 return;
1770 /*
1771 * This is broken; anything with a real cpu count has to
1772 * circumvent this idiocy regardless.
1773 */
1774 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1775
1776 /*
1777 * Set the IOAPIC ID to the value stored in the MPC table.
1778 */
1779 for (apic = 0; apic < nr_ioapics; apic++) {
1780
1781 /* Read the register 0 value */
1782 spin_lock_irqsave(&ioapic_lock, flags);
1783 reg_00.raw = io_apic_read(apic, 0);
1784 spin_unlock_irqrestore(&ioapic_lock, flags);
1785
1786 old_id = mp_ioapics[apic].mpc_apicid;
1787
1788 if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
1789 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1790 apic, mp_ioapics[apic].mpc_apicid);
1791 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1792 reg_00.bits.ID);
1793 mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
1794 }
1795
1796 /*
1797 * Sanity check, is the ID really free? Every APIC in a
1798 * system must have a unique ID or we get lots of nice
1799 * 'stuck on smp_invalidate_needed IPI wait' messages.
1800 */
1801 if (check_apicid_used(phys_id_present_map,
1802 mp_ioapics[apic].mpc_apicid)) {
1803 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1804 apic, mp_ioapics[apic].mpc_apicid);
1805 for (i = 0; i < get_physical_broadcast(); i++)
1806 if (!physid_isset(i, phys_id_present_map))
1807 break;
1808 if (i >= get_physical_broadcast())
1809 panic("Max APIC ID exceeded!\n");
1810 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1811 i);
1812 physid_set(i, phys_id_present_map);
1813 mp_ioapics[apic].mpc_apicid = i;
1814 } else {
1815 physid_mask_t tmp;
1816 tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
1817 apic_printk(APIC_VERBOSE, "Setting %d in the "
1818 "phys_id_present_map\n",
1819 mp_ioapics[apic].mpc_apicid);
1820 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1821 }
1822
1823
1824 /*
1825 * We need to adjust the IRQ routing table
1826 * if the ID changed.
1827 */
1828 if (old_id != mp_ioapics[apic].mpc_apicid)
1829 for (i = 0; i < mp_irq_entries; i++)
1830 if (mp_irqs[i].mpc_dstapic == old_id)
1831 mp_irqs[i].mpc_dstapic
1832 = mp_ioapics[apic].mpc_apicid;
1833
1834 /*
1835 * Read the right value from the MPC table and
1836 * write it into the ID register.
1837 */
1838 apic_printk(APIC_VERBOSE, KERN_INFO
1839 "...changing IO-APIC physical APIC ID to %d ...",
1840 mp_ioapics[apic].mpc_apicid);
1841
1842 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1843 spin_lock_irqsave(&ioapic_lock, flags);
1844 io_apic_write(apic, 0, reg_00.raw);
1845 spin_unlock_irqrestore(&ioapic_lock, flags);
1846
1847 /*
1848 * Sanity check
1849 */
1850 spin_lock_irqsave(&ioapic_lock, flags);
1851 reg_00.raw = io_apic_read(apic, 0);
1852 spin_unlock_irqrestore(&ioapic_lock, flags);
1853 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1854 printk("could not set ID!\n");
1855 else
1856 apic_printk(APIC_VERBOSE, " ok.\n");
1857 }
1858}
1859#else
1860static void __init setup_ioapic_ids_from_mpc(void) { }
1861#endif
1862
1863int no_timer_check __initdata;
1864
1865static int __init notimercheck(char *s)
1866{
1867 no_timer_check = 1;
1868 return 1;
1869}
1870__setup("no_timer_check", notimercheck);
1871
1872/*
1873 * There is a nasty bug in some older SMP boards: their mptable lies
1874 * about the timer IRQ. We do the following to work around the situation:
1875 *
1876 * - timer IRQ defaults to IO-APIC IRQ
1877 * - if this function detects that timer IRQs are defunct, then we fall
1878 * back to ISA timer IRQs
1879 */
1880static int __init timer_irq_works(void)
1881{
1882 unsigned long t1 = jiffies;
1883
1884 if (no_timer_check)
1885 return 1;
1886
1887 local_irq_enable();
1888 /* Let ten ticks pass... */
1889 mdelay((10 * 1000) / HZ);
1890
1891 /*
1892 * Expect a few ticks at least, to be sure some possible
1893 * glue logic does not lock up after one or two first
1894 * ticks in a non-ExtINT mode. Also the local APIC
1895 * might have cached one ExtINT interrupt. Finally, at
1896 * least one tick may be lost due to delays.
1897 */
1898 if (jiffies - t1 > 4)
1899 return 1;
1900
1901 return 0;
1902}
1903
1904/*
1905 * In the SMP+IOAPIC case it might happen that there are an unspecified
1906 * number of pending IRQ events unhandled. These cases are very rare,
1907 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1908 * better to do it this way as thus we do not have to be aware of
1909 * 'pending' interrupts in the IRQ path, except at this point.
1910 */
1911/*
1912 * Edge triggered needs to resend any interrupt
1913 * that was delayed but this is now handled in the device
1914 * independent code.
1915 */
1916
1917/*
1918 * Startup quirk:
1919 *
1920 * Starting up an edge-triggered IO-APIC interrupt is
1921 * nasty - we need to make sure that we get the edge.
1922 * If it is already asserted for some reason, we need to
1923 * return 1 to indicate that it was pending.
1924 *
1925 * This is not complete - we should be able to fake
1926 * an edge even if it isn't on the 8259A...
1927 *
1928 * (We do this for level-triggered IRQs too - it cannot hurt.)
1929 */
1930static unsigned int startup_ioapic_irq(unsigned int irq)
1931{
1932 int was_pending = 0;
1933 unsigned long flags;
1934
1935 spin_lock_irqsave(&ioapic_lock, flags);
1936 if (irq < 16) {
1937 disable_8259A_irq(irq);
1938 if (i8259A_irq_pending(irq))
1939 was_pending = 1;
1940 }
1941 __unmask_IO_APIC_irq(irq);
1942 spin_unlock_irqrestore(&ioapic_lock, flags);
1943
1944 return was_pending;
1945}
1946
1947static void ack_ioapic_irq(unsigned int irq)
1948{
1949 move_native_irq(irq);
1950 ack_APIC_irq();
1951}
1952
1953static void ack_ioapic_quirk_irq(unsigned int irq)
1954{
1955 unsigned long v;
1956 int i;
1957
1958 move_native_irq(irq);
1959/*
1960 * It appears there is an erratum which affects at least version 0x11
1961 * of I/O APIC (that's the 82093AA and cores integrated into various
1962 * chipsets). Under certain conditions a level-triggered interrupt is
1963 * erroneously delivered as an edge-triggered one, but the respective IRR
1964 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1965 * message but it will never arrive and further interrupts are blocked
1966 * from the source. The exact reason is so far unknown, but the
1967 * phenomenon was observed when two consecutive interrupt requests
1968 * from a given source get delivered to the same CPU and the source is
1969 * temporarily disabled in between.
1970 *
1971 * A workaround is to simulate an EOI message manually. We achieve it
1972 * by setting the trigger mode to edge and then to level when the edge
1973 * trigger mode gets detected in the TMR of a local APIC for a
1974 * level-triggered interrupt. We mask the source for the time of the
1975 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1976 * The idea is from Manfred Spraul. --macro
1977 */
1978 i = irq_vector[irq];
1979
1980 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1981
1982 ack_APIC_irq();
1983
1984 if (!(v & (1 << (i & 0x1f)))) {
1985 atomic_inc(&irq_mis_count);
1986 spin_lock(&ioapic_lock);
1987 __mask_and_edge_IO_APIC_irq(irq);
1988 __unmask_and_level_IO_APIC_irq(irq);
1989 spin_unlock(&ioapic_lock);
1990 }
1991}
1992
1993static int ioapic_retrigger_irq(unsigned int irq)
1994{
1995 send_IPI_self(irq_vector[irq]);
1996
1997 return 1;
1998}
1999
2000static struct irq_chip ioapic_chip __read_mostly = {
2001 .name = "IO-APIC",
2002 .startup = startup_ioapic_irq,
2003 .mask = mask_IO_APIC_irq,
2004 .unmask = unmask_IO_APIC_irq,
2005 .ack = ack_ioapic_irq,
2006 .eoi = ack_ioapic_quirk_irq,
2007#ifdef CONFIG_SMP
2008 .set_affinity = set_ioapic_affinity_irq,
2009#endif
2010 .retrigger = ioapic_retrigger_irq,
2011};
2012
2013
2014static inline void init_IO_APIC_traps(void)
2015{
2016 int irq;
2017
2018 /*
2019 * NOTE! The local APIC isn't very good at handling
2020 * multiple interrupts at the same interrupt level.
2021 * As the interrupt level is determined by taking the
2022 * vector number and shifting that right by 4, we
2023 * want to spread these out a bit so that they don't
2024 * all fall in the same interrupt level.
2025 *
2026 * Also, we've got to be careful not to trash gate
2027 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2028 */
2029 for (irq = 0; irq < NR_IRQS ; irq++) {
2030 int tmp = irq;
2031 if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2032 /*
2033 * Hmm.. We don't have an entry for this,
2034 * so default to an old-fashioned 8259
2035 * interrupt if we can..
2036 */
2037 if (irq < 16)
2038 make_8259A_irq(irq);
2039 else
2040 /* Strange. Oh, well.. */
2041 irq_desc[irq].chip = &no_irq_chip;
2042 }
2043 }
2044}
2045
2046/*
2047 * The local APIC irq-chip implementation:
2048 */
2049
2050static void ack_apic(unsigned int irq)
2051{
2052 ack_APIC_irq();
2053}
2054
2055static void mask_lapic_irq (unsigned int irq)
2056{
2057 unsigned long v;
2058
2059 v = apic_read(APIC_LVT0);
2060 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2061}
2062
2063static void unmask_lapic_irq (unsigned int irq)
2064{
2065 unsigned long v;
2066
2067 v = apic_read(APIC_LVT0);
2068 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2069}
2070
2071static struct irq_chip lapic_chip __read_mostly = {
2072 .name = "local-APIC-edge",
2073 .mask = mask_lapic_irq,
2074 .unmask = unmask_lapic_irq,
2075 .eoi = ack_apic,
2076};
2077
2078static void setup_nmi (void)
2079{
2080 /*
2081 * Dirty trick to enable the NMI watchdog ...
2082 * We put the 8259A master into AEOI mode and
2083 * unmask LVT0 on all local APICs as NMI.
2084 *
2085 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2086 * is from Maciej W. Rozycki - so we do not have to EOI from
2087 * the NMI handler or the timer interrupt.
2088 */
2089 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2090
2091 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
2092
2093 apic_printk(APIC_VERBOSE, " done.\n");
2094}
2095
2096/*
2097 * This looks a bit hackish, but it's about the only way of sending
2098 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2099 * not support the ExtINT mode, unfortunately. We need to send these
2100 * cycles as some i82489DX-based boards have glue logic that keeps the
2101 * 8259A interrupt line asserted until INTA. --macro
2102 */
2103static inline void unlock_ExtINT_logic(void)
2104{
2105 int apic, pin, i;
2106 struct IO_APIC_route_entry entry0, entry1;
2107 unsigned char save_control, save_freq_select;
2108
2109 pin = find_isa_irq_pin(8, mp_INT);
2110 if (pin == -1) {
2111 WARN_ON_ONCE(1);
2112 return;
2113 }
2114 apic = find_isa_irq_apic(8, mp_INT);
2115 if (apic == -1) {
2116 WARN_ON_ONCE(1);
2117 return;
2118 }
2119
2120 entry0 = ioapic_read_entry(apic, pin);
2121 clear_IO_APIC_pin(apic, pin);
2122
2123 memset(&entry1, 0, sizeof(entry1));
2124
2125 entry1.dest_mode = 0; /* physical delivery */
2126 entry1.mask = 0; /* unmask IRQ now */
2127 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2128 entry1.delivery_mode = dest_ExtINT;
2129 entry1.polarity = entry0.polarity;
2130 entry1.trigger = 0;
2131 entry1.vector = 0;
2132
2133 ioapic_write_entry(apic, pin, entry1);
2134
2135 save_control = CMOS_READ(RTC_CONTROL);
2136 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2137 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2138 RTC_FREQ_SELECT);
2139 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2140
2141 i = 100;
2142 while (i-- > 0) {
2143 mdelay(10);
2144 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2145 i -= 10;
2146 }
2147
2148 CMOS_WRITE(save_control, RTC_CONTROL);
2149 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2150 clear_IO_APIC_pin(apic, pin);
2151
2152 ioapic_write_entry(apic, pin, entry0);
2153}
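/*
 * Note: the CMOS writes above program the RTC periodic interrupt (rate
 * select 6, roughly 1 kHz on the MC146818) and enable RTC_PIE, so the loop
 * sees RTC_PF being set repeatedly; those periodic interrupts provide the
 * INTA cycles mentioned in the comment before the function.  Both the RTC
 * state and the IO-APIC entry are restored afterwards.
 */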
2154
2155int timer_uses_ioapic_pin_0;
2156
2157/*
2158 * This code may look a bit paranoid, but it's supposed to cooperate with
2159 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2160 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2161 * fanatically on his truly buggy board.
2162 */
2163static inline void __init check_timer(void)
2164{
2165 int apic1, pin1, apic2, pin2;
2166 int vector;
2167
2168 /*
2169 * get/set the timer IRQ vector:
2170 */
2171 disable_8259A_irq(0);
2172 vector = assign_irq_vector(0);
2173 set_intr_gate(vector, interrupt[0]);
2174
2175 /*
2176 * Subtle, code in do_timer_interrupt() expects an AEOI
2177 * mode for the 8259A whenever interrupts are routed
2178 * through I/O APICs. Also IRQ0 has to be enabled in
2179 * the 8259A which implies the virtual wire has to be
2180 * disabled in the local APIC.
2181 */
2182 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2183 init_8259A(1);
2184 timer_ack = 1;
2185 if (timer_over_8254 > 0)
2186 enable_8259A_irq(0);
2187
2188 pin1 = find_isa_irq_pin(0, mp_INT);
2189 apic1 = find_isa_irq_apic(0, mp_INT);
2190 pin2 = ioapic_i8259.pin;
2191 apic2 = ioapic_i8259.apic;
2192
2193 if (pin1 == 0)
2194 timer_uses_ioapic_pin_0 = 1;
2195
2196 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2197 vector, apic1, pin1, apic2, pin2);
2198
2199 if (pin1 != -1) {
2200 /*
2201 * Ok, does IRQ0 through the IOAPIC work?
2202 */
2203 unmask_IO_APIC_irq(0);
2204 if (timer_irq_works()) {
2205 if (nmi_watchdog == NMI_IO_APIC) {
2206 disable_8259A_irq(0);
2207 setup_nmi();
2208 enable_8259A_irq(0);
2209 }
2210 if (disable_timer_pin_1 > 0)
2211 clear_IO_APIC_pin(0, pin1);
2212 return;
2213 }
2214 clear_IO_APIC_pin(apic1, pin1);
2215 printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
2216 "IO-APIC\n");
2217 }
2218
2219 printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
2220 if (pin2 != -1) {
2221 printk("\n..... (found pin %d) ...", pin2);
2222 /*
2223 * legacy devices should be connected to IO APIC #0
2224 */
2225 setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
2226 if (timer_irq_works()) {
2227 printk("works.\n");
2228 if (pin1 != -1)
2229 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2230 else
2231 add_pin_to_irq(0, apic2, pin2);
2232 if (nmi_watchdog == NMI_IO_APIC) {
2233 setup_nmi();
2234 }
2235 return;
2236 }
2237 /*
2238 * Cleanup, just in case ...
2239 */
2240 clear_IO_APIC_pin(apic2, pin2);
2241 }
2242 printk(" failed.\n");
2243
2244 if (nmi_watchdog == NMI_IO_APIC) {
2245 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2246 nmi_watchdog = 0;
2247 }
2248
2249 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2250
2251 disable_8259A_irq(0);
2252 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2253 "fasteoi");
2254 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2255 enable_8259A_irq(0);
2256
2257 if (timer_irq_works()) {
2258 printk(" works.\n");
2259 return;
2260 }
2261 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2262 printk(" failed.\n");
2263
2264 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2265
2266 timer_ack = 0;
2267 init_8259A(0);
2268 make_8259A_irq(0);
2269 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2270
2271 unlock_ExtINT_logic();
2272
2273 if (timer_irq_works()) {
2274 printk(" works.\n");
2275 return;
2276 }
2277 printk(" failed :(.\n");
2278 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2279 "report. Then try booting with the 'noapic' option");
2280}
2281
2282/*
2283 *
2284 * IRQs that are handled by the PIC in the MPS IOAPIC case.
2285 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
2286 * Linux doesn't really care, as it's not actually used
2287 * for any interrupt handling anyway.
2288 */
2289#define PIC_IRQS (1 << PIC_CASCADE_IR)
2290
2291void __init setup_IO_APIC(void)
2292{
2293 enable_IO_APIC();
2294
2295 if (acpi_ioapic)
2296 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
2297 else
2298 io_apic_irqs = ~PIC_IRQS;
2299
2300 printk("ENABLING IO-APIC IRQs\n");
2301
2302 /*
2303 * Set up IO-APIC IRQ routing.
2304 */
2305 if (!acpi_ioapic)
2306 setup_ioapic_ids_from_mpc();
2307 sync_Arb_IDs();
2308 setup_IO_APIC_irqs();
2309 init_IO_APIC_traps();
2310 check_timer();
2311 if (!acpi_ioapic)
2312 print_IO_APIC();
2313}
2314
2315static int __init setup_disable_8254_timer(char *s)
2316{
2317 timer_over_8254 = -1;
2318 return 1;
2319}
2320static int __init setup_enable_8254_timer(char *s)
2321{
2322 timer_over_8254 = 2;
2323 return 1;
2324}
2325
2326__setup("disable_8254_timer", setup_disable_8254_timer);
2327__setup("enable_8254_timer", setup_enable_8254_timer);
2328
2329/*
2330 * Called after all the initialization is done. If we didn't find any
2331 * APIC bugs, then we can allow the modify fast path.
2332 */
2333
2334static int __init io_apic_bug_finalize(void)
2335{
2336 if(sis_apic_bug == -1)
2337 sis_apic_bug = 0;
2338 return 0;
2339}
2340
2341late_initcall(io_apic_bug_finalize);
2342
2343struct sysfs_ioapic_data {
2344 struct sys_device dev;
2345 struct IO_APIC_route_entry entry[0];
2346};
2347static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
2348
2349static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2350{
2351 struct IO_APIC_route_entry *entry;
2352 struct sysfs_ioapic_data *data;
2353 int i;
2354
2355 data = container_of(dev, struct sysfs_ioapic_data, dev);
2356 entry = data->entry;
2357 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2358 entry[i] = ioapic_read_entry(dev->id, i);
2359
2360 return 0;
2361}
2362
2363static int ioapic_resume(struct sys_device *dev)
2364{
2365 struct IO_APIC_route_entry *entry;
2366 struct sysfs_ioapic_data *data;
2367 unsigned long flags;
2368 union IO_APIC_reg_00 reg_00;
2369 int i;
2370
2371 data = container_of(dev, struct sysfs_ioapic_data, dev);
2372 entry = data->entry;
2373
2374 spin_lock_irqsave(&ioapic_lock, flags);
2375 reg_00.raw = io_apic_read(dev->id, 0);
2376 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
2377 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2378 io_apic_write(dev->id, 0, reg_00.raw);
2379 }
2380 spin_unlock_irqrestore(&ioapic_lock, flags);
2381 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2382 ioapic_write_entry(dev->id, i, entry[i]);
2383
2384 return 0;
2385}
2386
2387static struct sysdev_class ioapic_sysdev_class = {
2388 set_kset_name("ioapic"),
2389 .suspend = ioapic_suspend,
2390 .resume = ioapic_resume,
2391};
2392
2393static int __init ioapic_init_sysfs(void)
2394{
2395 struct sys_device * dev;
2396 int i, size, error = 0;
2397
2398 error = sysdev_class_register(&ioapic_sysdev_class);
2399 if (error)
2400 return error;
2401
2402 for (i = 0; i < nr_ioapics; i++ ) {
2403 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2404 * sizeof(struct IO_APIC_route_entry);
2405 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
2406 if (!mp_ioapic_data[i]) {
2407 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2408 continue;
2409 }
2410 memset(mp_ioapic_data[i], 0, size);
2411 dev = &mp_ioapic_data[i]->dev;
2412 dev->id = i;
2413 dev->cls = &ioapic_sysdev_class;
2414 error = sysdev_register(dev);
2415 if (error) {
2416 kfree(mp_ioapic_data[i]);
2417 mp_ioapic_data[i] = NULL;
2418 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2419 continue;
2420 }
2421 }
2422
2423 return 0;
2424}
2425
2426device_initcall(ioapic_init_sysfs);
2427
2428/*
2429 * Dynamic irq allocation and deallocation
2430 */
2431int create_irq(void)
2432{
2433 /* Allocate an unused irq */
2434 int irq, new, vector = 0;
2435 unsigned long flags;
2436
2437 irq = -ENOSPC;
2438 spin_lock_irqsave(&vector_lock, flags);
2439 for (new = (NR_IRQS - 1); new >= 0; new--) {
2440 if (platform_legacy_irq(new))
2441 continue;
2442 if (irq_vector[new] != 0)
2443 continue;
2444 vector = __assign_irq_vector(new);
2445 if (likely(vector > 0))
2446 irq = new;
2447 break;
2448 }
2449 spin_unlock_irqrestore(&vector_lock, flags);
2450
2451 if (irq >= 0) {
2452 set_intr_gate(vector, interrupt[irq]);
2453 dynamic_irq_init(irq);
2454 }
2455 return irq;
2456}
2457
2458void destroy_irq(unsigned int irq)
2459{
2460 unsigned long flags;
2461
2462 dynamic_irq_cleanup(irq);
2463
2464 spin_lock_irqsave(&vector_lock, flags);
2465 irq_vector[irq] = 0;
2466 spin_unlock_irqrestore(&vector_lock, flags);
2467}
2468
2469/*
2470 * MSI message composition
2471 */
2472#ifdef CONFIG_PCI_MSI
2473static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2474{
2475 int vector;
2476 unsigned dest;
2477
2478 vector = assign_irq_vector(irq);
2479 if (vector >= 0) {
2480 dest = cpu_mask_to_apicid(TARGET_CPUS);
2481
2482 msg->address_hi = MSI_ADDR_BASE_HI;
2483 msg->address_lo =
2484 MSI_ADDR_BASE_LO |
2485 ((INT_DEST_MODE == 0) ?
2486 MSI_ADDR_DEST_MODE_PHYSICAL:
2487 MSI_ADDR_DEST_MODE_LOGICAL) |
2488 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2489 MSI_ADDR_REDIRECTION_CPU:
2490 MSI_ADDR_REDIRECTION_LOWPRI) |
2491 MSI_ADDR_DEST_ID(dest);
2492
2493 msg->data =
2494 MSI_DATA_TRIGGER_EDGE |
2495 MSI_DATA_LEVEL_ASSERT |
2496 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2497 MSI_DATA_DELIVERY_FIXED:
2498 MSI_DATA_DELIVERY_LOWPRI) |
2499 MSI_DATA_VECTOR(vector);
2500 }
2501 return vector;
2502}
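/*
 * Rough shape of the composed message (per the MSI definitions in
 * <asm/msidef.h>): address_lo selects the 0xFEExxxxx interrupt-delivery
 * window with the destination APIC ID in bits 19:12, while msg->data
 * carries the vector in its low byte plus the edge-trigger, assert-level
 * and delivery-mode bits - much the same information an IO-APIC RTE would
 * hold, only written by the device as a DMA store.
 */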
2503
2504#ifdef CONFIG_SMP
2505static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2506{
2507 struct msi_msg msg;
2508 unsigned int dest;
2509 cpumask_t tmp;
2510 int vector;
2511
2512 cpus_and(tmp, mask, cpu_online_map);
2513 if (cpus_empty(tmp))
2514 tmp = TARGET_CPUS;
2515
2516 vector = assign_irq_vector(irq);
2517 if (vector < 0)
2518 return;
2519
2520 dest = cpu_mask_to_apicid(mask);
2521
2522 read_msi_msg(irq, &msg);
2523
2524 msg.data &= ~MSI_DATA_VECTOR_MASK;
2525 msg.data |= MSI_DATA_VECTOR(vector);
2526 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2527 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2528
2529 write_msi_msg(irq, &msg);
2530 irq_desc[irq].affinity = mask;
2531}
2532#endif /* CONFIG_SMP */
2533
2534/*
2535 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2536 * which implement the MSI or MSI-X Capability Structure.
2537 */
2538static struct irq_chip msi_chip = {
2539 .name = "PCI-MSI",
2540 .unmask = unmask_msi_irq,
2541 .mask = mask_msi_irq,
2542 .ack = ack_ioapic_irq,
2543#ifdef CONFIG_SMP
2544 .set_affinity = set_msi_irq_affinity,
2545#endif
2546 .retrigger = ioapic_retrigger_irq,
2547};
2548
2549int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2550{
2551 struct msi_msg msg;
2552 int irq, ret;
2553 irq = create_irq();
2554 if (irq < 0)
2555 return irq;
2556
2557 ret = msi_compose_msg(dev, irq, &msg);
2558 if (ret < 0) {
2559 destroy_irq(irq);
2560 return ret;
2561 }
2562
2563 set_irq_msi(irq, desc);
2564 write_msi_msg(irq, &msg);
2565
2566 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2567 "edge");
2568
2569 return 0;
2570}
2571
2572void arch_teardown_msi_irq(unsigned int irq)
2573{
2574 destroy_irq(irq);
2575}
2576
2577#endif /* CONFIG_PCI_MSI */
2578
2579/*
2580 * Hypertransport interrupt support
2581 */
2582#ifdef CONFIG_HT_IRQ
2583
2584#ifdef CONFIG_SMP
2585
2586static void target_ht_irq(unsigned int irq, unsigned int dest)
2587{
2588 struct ht_irq_msg msg;
2589 fetch_ht_irq_msg(irq, &msg);
2590
2591 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2592 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2593
2594 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2595 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2596
2597 write_ht_irq_msg(irq, &msg);
2598}
2599
2600static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2601{
2602 unsigned int dest;
2603 cpumask_t tmp;
2604
2605 cpus_and(tmp, mask, cpu_online_map);
2606 if (cpus_empty(tmp))
2607 tmp = TARGET_CPUS;
2608
2609 cpus_and(mask, tmp, CPU_MASK_ALL);
2610
2611 dest = cpu_mask_to_apicid(mask);
2612
2613 target_ht_irq(irq, dest);
2614 irq_desc[irq].affinity = mask;
2615}
2616#endif
2617
2618static struct irq_chip ht_irq_chip = {
2619 .name = "PCI-HT",
2620 .mask = mask_ht_irq,
2621 .unmask = unmask_ht_irq,
2622 .ack = ack_ioapic_irq,
2623#ifdef CONFIG_SMP
2624 .set_affinity = set_ht_irq_affinity,
2625#endif
2626 .retrigger = ioapic_retrigger_irq,
2627};
2628
2629int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2630{
2631 int vector;
2632
2633 vector = assign_irq_vector(irq);
2634 if (vector >= 0) {
2635 struct ht_irq_msg msg;
2636 unsigned dest;
2637 cpumask_t tmp;
2638
2639 cpus_clear(tmp);
2640 cpu_set(vector >> 8, tmp);
2641 dest = cpu_mask_to_apicid(tmp);
2642
2643 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2644
2645 msg.address_lo =
2646 HT_IRQ_LOW_BASE |
2647 HT_IRQ_LOW_DEST_ID(dest) |
2648 HT_IRQ_LOW_VECTOR(vector) |
2649 ((INT_DEST_MODE == 0) ?
2650 HT_IRQ_LOW_DM_PHYSICAL :
2651 HT_IRQ_LOW_DM_LOGICAL) |
2652 HT_IRQ_LOW_RQEOI_EDGE |
2653 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2654 HT_IRQ_LOW_MT_FIXED :
2655 HT_IRQ_LOW_MT_ARBITRATED) |
2656 HT_IRQ_LOW_IRQ_MASKED;
2657
2658 write_ht_irq_msg(irq, &msg);
2659
2660 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2661 handle_edge_irq, "edge");
2662 }
2663 return vector;
2664}
2665#endif /* CONFIG_HT_IRQ */
2666
2667/* --------------------------------------------------------------------------
2668 ACPI-based IOAPIC Configuration
2669 -------------------------------------------------------------------------- */
2670
2671#ifdef CONFIG_ACPI
2672
2673int __init io_apic_get_unique_id (int ioapic, int apic_id)
2674{
2675 union IO_APIC_reg_00 reg_00;
2676 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2677 physid_mask_t tmp;
2678 unsigned long flags;
2679 int i = 0;
2680
2681 /*
2682 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2683 * buses (one for LAPICs, one for IOAPICs), where predecessors only
2684 * supports up to 16 on one shared APIC bus.
2685 *
2686 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2687 * advantage of new APIC bus architecture.
2688 */
2689
2690 if (physids_empty(apic_id_map))
2691 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2692
2693 spin_lock_irqsave(&ioapic_lock, flags);
2694 reg_00.raw = io_apic_read(ioapic, 0);
2695 spin_unlock_irqrestore(&ioapic_lock, flags);
2696
2697 if (apic_id >= get_physical_broadcast()) {
2698 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2699 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2700 apic_id = reg_00.bits.ID;
2701 }
2702
2703 /*
2704 * Every APIC in a system must have a unique ID or we get lots of nice
2705 * 'stuck on smp_invalidate_needed IPI wait' messages.
2706 */
2707 if (check_apicid_used(apic_id_map, apic_id)) {
2708
2709 for (i = 0; i < get_physical_broadcast(); i++) {
2710 if (!check_apicid_used(apic_id_map, i))
2711 break;
2712 }
2713
2714 if (i == get_physical_broadcast())
2715 panic("Max apic_id exceeded!\n");
2716
2717 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2718 "trying %d\n", ioapic, apic_id, i);
2719
2720 apic_id = i;
2721 }
2722
2723 tmp = apicid_to_cpu_present(apic_id);
2724 physids_or(apic_id_map, apic_id_map, tmp);
2725
2726 if (reg_00.bits.ID != apic_id) {
2727 reg_00.bits.ID = apic_id;
2728
2729 spin_lock_irqsave(&ioapic_lock, flags);
2730 io_apic_write(ioapic, 0, reg_00.raw);
2731 reg_00.raw = io_apic_read(ioapic, 0);
2732 spin_unlock_irqrestore(&ioapic_lock, flags);
2733
2734 /* Sanity check */
2735 if (reg_00.bits.ID != apic_id) {
2736 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2737 return -1;
2738 }
2739 }
2740
2741 apic_printk(APIC_VERBOSE, KERN_INFO
2742 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2743
2744 return apic_id;
2745}
2746
2747
2748int __init io_apic_get_version (int ioapic)
2749{
2750 union IO_APIC_reg_01 reg_01;
2751 unsigned long flags;
2752
2753 spin_lock_irqsave(&ioapic_lock, flags);
2754 reg_01.raw = io_apic_read(ioapic, 1);
2755 spin_unlock_irqrestore(&ioapic_lock, flags);
2756
2757 return reg_01.bits.version;
2758}
2759
2760
2761int __init io_apic_get_redir_entries (int ioapic)
2762{
2763 union IO_APIC_reg_01 reg_01;
2764 unsigned long flags;
2765
2766 spin_lock_irqsave(&ioapic_lock, flags);
2767 reg_01.raw = io_apic_read(ioapic, 1);
2768 spin_unlock_irqrestore(&ioapic_lock, flags);
2769
2770 return reg_01.bits.entries;
2771}
2772
2773
2774int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
2775{
2776 struct IO_APIC_route_entry entry;
2777 unsigned long flags;
2778
2779 if (!IO_APIC_IRQ(irq)) {
2780 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2781 ioapic);
2782 return -EINVAL;
2783 }
2784
2785 /*
2786 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2787 * Note that we mask (disable) IRQs now -- these get enabled when the
2788 * corresponding device driver registers for this IRQ.
2789 */
2790
2791 memset(&entry,0,sizeof(entry));
2792
2793 entry.delivery_mode = INT_DELIVERY_MODE;
2794 entry.dest_mode = INT_DEST_MODE;
2795 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2796 entry.trigger = edge_level;
2797 entry.polarity = active_high_low;
2798 entry.mask = 1;
2799
2800 /*
2801 * IRQs < 16 are already in the irq_2_pin[] map
2802 */
2803 if (irq >= 16)
2804 add_pin_to_irq(irq, ioapic, pin);
2805
2806 entry.vector = assign_irq_vector(irq);
2807
2808 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2809 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2810 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
2811 edge_level, active_high_low);
2812
2813 ioapic_register_intr(irq, entry.vector, edge_level);
2814
2815 if (!ioapic && (irq < 16))
2816 disable_8259A_irq(irq);
2817
2818 spin_lock_irqsave(&ioapic_lock, flags);
2819 __ioapic_write_entry(ioapic, pin, entry);
2820 spin_unlock_irqrestore(&ioapic_lock, flags);
2821
2822 return 0;
2823}
2824
2825#endif /* CONFIG_ACPI */
2826
2827static int __init parse_disable_timer_pin_1(char *arg)
2828{
2829 disable_timer_pin_1 = 1;
2830 return 0;
2831}
2832early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2833
2834static int __init parse_enable_timer_pin_1(char *arg)
2835{
2836 disable_timer_pin_1 = -1;
2837 return 0;
2838}
2839early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2840
2841static int __init parse_noapic(char *arg)
2842{
2843 /* disable IO-APIC */
2844 disable_ioapic_setup();
2845 return 0;
2846}
2847early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
new file mode 100644
index 000000000000..966fa1062491
--- /dev/null
+++ b/arch/x86/kernel/io_apic_64.c
@@ -0,0 +1,2202 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/pci.h>
29#include <linux/mc146818rtc.h>
30#include <linux/acpi.h>
31#include <linux/sysdev.h>
32#include <linux/msi.h>
33#include <linux/htirq.h>
34#ifdef CONFIG_ACPI
35#include <acpi/acpi_bus.h>
36#endif
37
38#include <asm/idle.h>
39#include <asm/io.h>
40#include <asm/smp.h>
41#include <asm/desc.h>
42#include <asm/proto.h>
43#include <asm/mach_apic.h>
44#include <asm/acpi.h>
45#include <asm/dma.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50struct irq_cfg {
51 cpumask_t domain;
52 cpumask_t old_domain;
53 unsigned move_cleanup_count;
54 u8 vector;
55 u8 move_in_progress : 1;
56};
57
58/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
59struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
60 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
61 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
62 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
63 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
64 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
65 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
66 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
67 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
68 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
69 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
70 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
71 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
72 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
73 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
74 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
75 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
76};
77
78static int assign_irq_vector(int irq, cpumask_t mask);
79
80#define __apicdebuginit __init
81
82int sis_apic_bug; /* not actually supported, dummy for compile */
83
84static int no_timer_check;
85
86static int disable_timer_pin_1 __initdata;
87
88int timer_over_8254 __initdata = 1;
89
 90/* Where, if anywhere, the i8259 is connected in external int mode */
91static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
92
93static DEFINE_SPINLOCK(ioapic_lock);
94DEFINE_SPINLOCK(vector_lock);
95
96/*
97 * # of IRQ routing registers
98 */
99int nr_ioapic_registers[MAX_IO_APICS];
100
101/*
 102 * Rough estimate of how many shared IRQs there are; this can
 103 * be changed at any time.
104 */
105#define MAX_PLUS_SHARED_IRQS NR_IRQS
106#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
107
108/*
 109 * This is performance-critical; we want to do it in O(1)
110 *
111 * the indexing order of this array favors 1:1 mappings
112 * between pins and IRQs.
113 */
114
115static struct irq_pin_list {
116 short apic, pin, next;
117} irq_2_pin[PIN_MAP_SIZE];
118
119struct io_apic {
120 unsigned int index;
121 unsigned int unused[3];
122 unsigned int data;
123};
124
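/*
 * Each IO-APIC's register window is reached through a fixmap slot; the
 * offset of the MMIO address within its page is added back below so the
 * mapping also works when the IO-APIC is not page aligned.
 */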
125static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
126{
127 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
128 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
129}
130
131static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
132{
133 struct io_apic __iomem *io_apic = io_apic_base(apic);
134 writel(reg, &io_apic->index);
135 return readl(&io_apic->data);
136}
137
138static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
139{
140 struct io_apic __iomem *io_apic = io_apic_base(apic);
141 writel(reg, &io_apic->index);
142 writel(value, &io_apic->data);
143}
144
145/*
146 * Re-write a value: to be used for read-modify-write
147 * cycles where the read already set up the index register.
148 */
149static inline void io_apic_modify(unsigned int apic, unsigned int value)
150{
151 struct io_apic __iomem *io_apic = io_apic_base(apic);
152 writel(value, &io_apic->data);
153}
154
155static int io_apic_level_ack_pending(unsigned int irq)
156{
157 struct irq_pin_list *entry;
158 unsigned long flags;
159 int pending = 0;
160
161 spin_lock_irqsave(&ioapic_lock, flags);
162 entry = irq_2_pin + irq;
163 for (;;) {
164 unsigned int reg;
165 int pin;
166
167 pin = entry->pin;
168 if (pin == -1)
169 break;
170 reg = io_apic_read(entry->apic, 0x10 + pin*2);
171 /* Is the remote IRR bit set? */
172 pending |= (reg >> 14) & 1;
173 if (!entry->next)
174 break;
175 entry = irq_2_pin + entry->next;
176 }
177 spin_unlock_irqrestore(&ioapic_lock, flags);
178 return pending;
179}
180
181/*
182 * Synchronize the IO-APIC and the CPU by doing
183 * a dummy read from the IO-APIC
184 */
185static inline void io_apic_sync(unsigned int apic)
186{
187 struct io_apic __iomem *io_apic = io_apic_base(apic);
188 readl(&io_apic->data);
189}
190
191#define __DO_ACTION(R, ACTION, FINAL) \
192 \
193{ \
194 int pin; \
195 struct irq_pin_list *entry = irq_2_pin + irq; \
196 \
197 BUG_ON(irq >= NR_IRQS); \
198 for (;;) { \
199 unsigned int reg; \
200 pin = entry->pin; \
201 if (pin == -1) \
202 break; \
203 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
204 reg ACTION; \
205 io_apic_modify(entry->apic, reg); \
206 FINAL; \
207 if (!entry->next) \
208 break; \
209 entry = irq_2_pin + entry->next; \
210 } \
211}
212
213union entry_union {
214 struct { u32 w1, w2; };
215 struct IO_APIC_route_entry entry;
216};
217
218static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
219{
220 union entry_union eu;
221 unsigned long flags;
222 spin_lock_irqsave(&ioapic_lock, flags);
223 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
224 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
225 spin_unlock_irqrestore(&ioapic_lock, flags);
226 return eu.entry;
227}
228
229/*
230 * When we write a new IO APIC routing entry, we need to write the high
231 * word first! If the mask bit in the low word is clear, we will enable
232 * the interrupt, and we need to make sure the entry is fully populated
233 * before that happens.
234 */
235static void
236__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
237{
238 union entry_union eu;
239 eu.entry = e;
240 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
241 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
242}
243
244static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
245{
246 unsigned long flags;
247 spin_lock_irqsave(&ioapic_lock, flags);
248 __ioapic_write_entry(apic, pin, e);
249 spin_unlock_irqrestore(&ioapic_lock, flags);
250}
251
252/*
253 * When we mask an IO APIC routing entry, we need to write the low
254 * word first, in order to set the mask bit before we change the
255 * high bits!
256 */
257static void ioapic_mask_entry(int apic, int pin)
258{
259 unsigned long flags;
260 union entry_union eu = { .entry.mask = 1 };
261
262 spin_lock_irqsave(&ioapic_lock, flags);
263 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
264 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
265 spin_unlock_irqrestore(&ioapic_lock, flags);
266}
267
268#ifdef CONFIG_SMP
269static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
270{
271 int apic, pin;
272 struct irq_pin_list *entry = irq_2_pin + irq;
273
274 BUG_ON(irq >= NR_IRQS);
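	/*
	 * Walk every (apic, pin) this IRQ is routed through and rewrite the
	 * destination (high RTE word) and the vector (low byte of the low
	 * RTE word) in place.
	 */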
275 for (;;) {
276 unsigned int reg;
277 apic = entry->apic;
278 pin = entry->pin;
279 if (pin == -1)
280 break;
281 io_apic_write(apic, 0x11 + pin*2, dest);
282 reg = io_apic_read(apic, 0x10 + pin*2);
283 reg &= ~0x000000ff;
284 reg |= vector;
285 io_apic_modify(apic, reg);
286 if (!entry->next)
287 break;
288 entry = irq_2_pin + entry->next;
289 }
290}
291
292static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
293{
294 struct irq_cfg *cfg = irq_cfg + irq;
295 unsigned long flags;
296 unsigned int dest;
297 cpumask_t tmp;
298
299 cpus_and(tmp, mask, cpu_online_map);
300 if (cpus_empty(tmp))
301 return;
302
303 if (assign_irq_vector(irq, mask))
304 return;
305
306 cpus_and(tmp, cfg->domain, mask);
307 dest = cpu_mask_to_apicid(tmp);
308
309 /*
310 * Only the high 8 bits are valid.
311 */
312 dest = SET_APIC_LOGICAL_ID(dest);
313
314 spin_lock_irqsave(&ioapic_lock, flags);
315 __target_IO_APIC_irq(irq, dest, cfg->vector);
316 irq_desc[irq].affinity = mask;
317 spin_unlock_irqrestore(&ioapic_lock, flags);
318}
319#endif
320
321/*
322 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
323 * shared ISA-space IRQs, so we have to support them. We are super
324 * fast in the common case, and fast for shared ISA-space IRQs.
325 */
326static void add_pin_to_irq(unsigned int irq, int apic, int pin)
327{
328 static int first_free_entry = NR_IRQS;
329 struct irq_pin_list *entry = irq_2_pin + irq;
330
331 BUG_ON(irq >= NR_IRQS);
332 while (entry->next)
333 entry = irq_2_pin + entry->next;
334
335 if (entry->pin != -1) {
336 entry->next = first_free_entry;
337 entry = irq_2_pin + entry->next;
338 if (++first_free_entry >= PIN_MAP_SIZE)
339 panic("io_apic.c: ran out of irq_2_pin entries!");
340 }
341 entry->apic = apic;
342 entry->pin = pin;
343}
344
345
346#define DO_ACTION(name,R,ACTION, FINAL) \
347 \
348 static void name##_IO_APIC_irq (unsigned int irq) \
349 __DO_ACTION(R, ACTION, FINAL)
350
351DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
352 /* mask = 1 */
353DO_ACTION( __unmask, 0, &= 0xfffeffff, )
354 /* mask = 0 */
355
356static void mask_IO_APIC_irq (unsigned int irq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&ioapic_lock, flags);
361 __mask_IO_APIC_irq(irq);
362 spin_unlock_irqrestore(&ioapic_lock, flags);
363}
364
365static void unmask_IO_APIC_irq (unsigned int irq)
366{
367 unsigned long flags;
368
369 spin_lock_irqsave(&ioapic_lock, flags);
370 __unmask_IO_APIC_irq(irq);
371 spin_unlock_irqrestore(&ioapic_lock, flags);
372}
373
374static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
375{
376 struct IO_APIC_route_entry entry;
377
378 /* Check delivery_mode to be sure we're not clearing an SMI pin */
379 entry = ioapic_read_entry(apic, pin);
380 if (entry.delivery_mode == dest_SMI)
381 return;
382 /*
383 * Disable it in the IO-APIC irq-routing table:
384 */
385 ioapic_mask_entry(apic, pin);
386}
387
388static void clear_IO_APIC (void)
389{
390 int apic, pin;
391
392 for (apic = 0; apic < nr_ioapics; apic++)
393 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
394 clear_IO_APIC_pin(apic, pin);
395}
396
397int skip_ioapic_setup;
398int ioapic_force;
399
400static int __init parse_noapic(char *str)
401{
402 disable_ioapic_setup();
403 return 0;
404}
405early_param("noapic", parse_noapic);
406
407/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
408static int __init disable_timer_pin_setup(char *arg)
409{
410 disable_timer_pin_1 = 1;
411 return 1;
412}
413__setup("disable_timer_pin_1", disable_timer_pin_setup);
414
415static int __init setup_disable_8254_timer(char *s)
416{
417 timer_over_8254 = -1;
418 return 1;
419}
420static int __init setup_enable_8254_timer(char *s)
421{
422 timer_over_8254 = 2;
423 return 1;
424}
425
426__setup("disable_8254_timer", setup_disable_8254_timer);
427__setup("enable_8254_timer", setup_enable_8254_timer);
428
429
430/*
431 * Find the IRQ entry number of a certain pin.
432 */
433static int find_irq_entry(int apic, int pin, int type)
434{
435 int i;
436
437 for (i = 0; i < mp_irq_entries; i++)
438 if (mp_irqs[i].mpc_irqtype == type &&
439 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
440 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
441 mp_irqs[i].mpc_dstirq == pin)
442 return i;
443
444 return -1;
445}
446
447/*
448 * Find the pin to which IRQ[irq] (ISA) is connected
449 */
450static int __init find_isa_irq_pin(int irq, int type)
451{
452 int i;
453
454 for (i = 0; i < mp_irq_entries; i++) {
455 int lbus = mp_irqs[i].mpc_srcbus;
456
457 if (test_bit(lbus, mp_bus_not_pci) &&
458 (mp_irqs[i].mpc_irqtype == type) &&
459 (mp_irqs[i].mpc_srcbusirq == irq))
460
461 return mp_irqs[i].mpc_dstirq;
462 }
463 return -1;
464}
465
466static int __init find_isa_irq_apic(int irq, int type)
467{
468 int i;
469
470 for (i = 0; i < mp_irq_entries; i++) {
471 int lbus = mp_irqs[i].mpc_srcbus;
472
473 if (test_bit(lbus, mp_bus_not_pci) &&
474 (mp_irqs[i].mpc_irqtype == type) &&
475 (mp_irqs[i].mpc_srcbusirq == irq))
476 break;
477 }
478 if (i < mp_irq_entries) {
479 int apic;
480 for(apic = 0; apic < nr_ioapics; apic++) {
481 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
482 return apic;
483 }
484 }
485
486 return -1;
487}
488
489/*
490 * Find a specific PCI IRQ entry.
491 * Not an __init, possibly needed by modules
492 */
493static int pin_2_irq(int idx, int apic, int pin);
494
495int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
496{
497 int apic, i, best_guess = -1;
498
499 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
500 bus, slot, pin);
501 if (mp_bus_id_to_pci_bus[bus] == -1) {
502 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
503 return -1;
504 }
505 for (i = 0; i < mp_irq_entries; i++) {
506 int lbus = mp_irqs[i].mpc_srcbus;
507
508 for (apic = 0; apic < nr_ioapics; apic++)
509 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
510 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
511 break;
512
513 if (!test_bit(lbus, mp_bus_not_pci) &&
514 !mp_irqs[i].mpc_irqtype &&
515 (bus == lbus) &&
516 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
517 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
518
519 if (!(apic || IO_APIC_IRQ(irq)))
520 continue;
521
522 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
523 return irq;
524 /*
525 * Use the first all-but-pin matching entry as a
526 * best-guess fuzzy result for broken mptables.
527 */
528 if (best_guess < 0)
529 best_guess = irq;
530 }
531 }
532 BUG_ON(best_guess >= NR_IRQS);
533 return best_guess;
534}
535
536/* ISA interrupts are always polarity zero edge triggered,
537 * when listed as conforming in the MP table. */
538
539#define default_ISA_trigger(idx) (0)
540#define default_ISA_polarity(idx) (0)
541
542/* PCI interrupts are always polarity one level triggered,
543 * when listed as conforming in the MP table. */
544
545#define default_PCI_trigger(idx) (1)
546#define default_PCI_polarity(idx) (1)
547
548static int __init MPBIOS_polarity(int idx)
549{
550 int bus = mp_irqs[idx].mpc_srcbus;
551 int polarity;
552
553 /*
554 * Determine IRQ line polarity (high active or low active):
555 */
556 switch (mp_irqs[idx].mpc_irqflag & 3)
557 {
558 case 0: /* conforms, ie. bus-type dependent polarity */
559 if (test_bit(bus, mp_bus_not_pci))
560 polarity = default_ISA_polarity(idx);
561 else
562 polarity = default_PCI_polarity(idx);
563 break;
564 case 1: /* high active */
565 {
566 polarity = 0;
567 break;
568 }
569 case 2: /* reserved */
570 {
571 printk(KERN_WARNING "broken BIOS!!\n");
572 polarity = 1;
573 break;
574 }
575 case 3: /* low active */
576 {
577 polarity = 1;
578 break;
579 }
580 default: /* invalid */
581 {
582 printk(KERN_WARNING "broken BIOS!!\n");
583 polarity = 1;
584 break;
585 }
586 }
587 return polarity;
588}
589
590static int MPBIOS_trigger(int idx)
591{
592 int bus = mp_irqs[idx].mpc_srcbus;
593 int trigger;
594
595 /*
596 * Determine IRQ trigger mode (edge or level sensitive):
597 */
598 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
599 {
600 case 0: /* conforms, ie. bus-type dependent */
601 if (test_bit(bus, mp_bus_not_pci))
602 trigger = default_ISA_trigger(idx);
603 else
604 trigger = default_PCI_trigger(idx);
605 break;
606 case 1: /* edge */
607 {
608 trigger = 0;
609 break;
610 }
611 case 2: /* reserved */
612 {
613 printk(KERN_WARNING "broken BIOS!!\n");
614 trigger = 1;
615 break;
616 }
617 case 3: /* level */
618 {
619 trigger = 1;
620 break;
621 }
622 default: /* invalid */
623 {
624 printk(KERN_WARNING "broken BIOS!!\n");
625 trigger = 0;
626 break;
627 }
628 }
629 return trigger;
630}
631
632static inline int irq_polarity(int idx)
633{
634 return MPBIOS_polarity(idx);
635}
636
637static inline int irq_trigger(int idx)
638{
639 return MPBIOS_trigger(idx);
640}
641
642static int pin_2_irq(int idx, int apic, int pin)
643{
644 int irq, i;
645 int bus = mp_irqs[idx].mpc_srcbus;
646
647 /*
648 * Debugging check, we are in big trouble if this message pops up!
649 */
650 if (mp_irqs[idx].mpc_dstirq != pin)
651 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
652
653 if (test_bit(bus, mp_bus_not_pci)) {
654 irq = mp_irqs[idx].mpc_srcbusirq;
655 } else {
656 /*
657 * PCI IRQs are mapped in order
658 */
659 i = irq = 0;
660 while (i < apic)
661 irq += nr_ioapic_registers[i++];
662 irq += pin;
663 }
664 BUG_ON(irq >= NR_IRQS);
665 return irq;
666}
667
668static int __assign_irq_vector(int irq, cpumask_t mask)
669{
670 /*
671 * NOTE! The local APIC isn't very good at handling
672 * multiple interrupts at the same interrupt level.
673 * As the interrupt level is determined by taking the
674 * vector number and shifting that right by 4, we
675 * want to spread these out a bit so that they don't
676 * all fall in the same interrupt level.
677 *
678 * Also, we've got to be careful not to trash gate
679 * 0x80, because int 0x80 is hm, kind of importantish. ;)
680 */
681 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
682 unsigned int old_vector;
683 int cpu;
684 struct irq_cfg *cfg;
685
686 BUG_ON((unsigned)irq >= NR_IRQS);
687 cfg = &irq_cfg[irq];
688
689 /* Only try and allocate irqs on cpus that are present */
690 cpus_and(mask, mask, cpu_online_map);
691
692 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
693 return -EBUSY;
694
695 old_vector = cfg->vector;
696 if (old_vector) {
697 cpumask_t tmp;
698 cpus_and(tmp, cfg->domain, mask);
699 if (!cpus_empty(tmp))
700 return 0;
701 }
702
703 for_each_cpu_mask(cpu, mask) {
704 cpumask_t domain, new_mask;
705 int new_cpu;
706 int vector, offset;
707
708 domain = vector_allocation_domain(cpu);
709 cpus_and(new_mask, domain, cpu_online_map);
710
711 vector = current_vector;
712 offset = current_offset;
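		/*
		 * Search this cpu's allocation domain for a free vector.
		 * Allocations advance in steps of 8 to spread vectors across
		 * priority levels (a level is vector >> 4, i.e. 16 vectors
		 * wide); once the search passes FIRST_SYSTEM_VECTOR it wraps
		 * back to FIRST_DEVICE_VECTOR with the offset rotated, so
		 * repeated passes probe every device vector.
		 */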
713next:
714 vector += 8;
715 if (vector >= FIRST_SYSTEM_VECTOR) {
716 /* If we run out of vectors on large boxen, must share them. */
717 offset = (offset + 1) % 8;
718 vector = FIRST_DEVICE_VECTOR + offset;
719 }
720 if (unlikely(current_vector == vector))
721 continue;
722 if (vector == IA32_SYSCALL_VECTOR)
723 goto next;
724 for_each_cpu_mask(new_cpu, new_mask)
725 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
726 goto next;
727 /* Found one! */
728 current_vector = vector;
729 current_offset = offset;
730 if (old_vector) {
731 cfg->move_in_progress = 1;
732 cfg->old_domain = cfg->domain;
733 }
734 for_each_cpu_mask(new_cpu, new_mask)
735 per_cpu(vector_irq, new_cpu)[vector] = irq;
736 cfg->vector = vector;
737 cfg->domain = domain;
738 return 0;
739 }
740 return -ENOSPC;
741}
742
743static int assign_irq_vector(int irq, cpumask_t mask)
744{
745 int err;
746 unsigned long flags;
747
748 spin_lock_irqsave(&vector_lock, flags);
749 err = __assign_irq_vector(irq, mask);
750 spin_unlock_irqrestore(&vector_lock, flags);
751 return err;
752}
753
754static void __clear_irq_vector(int irq)
755{
756 struct irq_cfg *cfg;
757 cpumask_t mask;
758 int cpu, vector;
759
760 BUG_ON((unsigned)irq >= NR_IRQS);
761 cfg = &irq_cfg[irq];
762 BUG_ON(!cfg->vector);
763
764 vector = cfg->vector;
765 cpus_and(mask, cfg->domain, cpu_online_map);
766 for_each_cpu_mask(cpu, mask)
767 per_cpu(vector_irq, cpu)[vector] = -1;
768
769 cfg->vector = 0;
770 cfg->domain = CPU_MASK_NONE;
771}
772
773void __setup_vector_irq(int cpu)
774{
775 /* Initialize vector_irq on a new cpu */
776 /* This function must be called with vector_lock held */
777 int irq, vector;
778
779 /* Mark the inuse vectors */
780 for (irq = 0; irq < NR_IRQS; ++irq) {
781 if (!cpu_isset(cpu, irq_cfg[irq].domain))
782 continue;
783 vector = irq_cfg[irq].vector;
784 per_cpu(vector_irq, cpu)[vector] = irq;
785 }
786 /* Mark the free vectors */
787 for (vector = 0; vector < NR_VECTORS; ++vector) {
788 irq = per_cpu(vector_irq, cpu)[vector];
789 if (irq < 0)
790 continue;
791 if (!cpu_isset(cpu, irq_cfg[irq].domain))
792 per_cpu(vector_irq, cpu)[vector] = -1;
793 }
794}
795
796
797static struct irq_chip ioapic_chip;
798
799static void ioapic_register_intr(int irq, unsigned long trigger)
800{
801 if (trigger) {
802 irq_desc[irq].status |= IRQ_LEVEL;
803 set_irq_chip_and_handler_name(irq, &ioapic_chip,
804 handle_fasteoi_irq, "fasteoi");
805 } else {
806 irq_desc[irq].status &= ~IRQ_LEVEL;
807 set_irq_chip_and_handler_name(irq, &ioapic_chip,
808 handle_edge_irq, "edge");
809 }
810}
811
812static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
813 int trigger, int polarity)
814{
815 struct irq_cfg *cfg = irq_cfg + irq;
816 struct IO_APIC_route_entry entry;
817 cpumask_t mask;
818
819 if (!IO_APIC_IRQ(irq))
820 return;
821
822 mask = TARGET_CPUS;
823 if (assign_irq_vector(irq, mask))
824 return;
825
826 cpus_and(mask, cfg->domain, mask);
827
828 apic_printk(APIC_VERBOSE,KERN_DEBUG
829 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
830 "IRQ %d Mode:%i Active:%i)\n",
831 apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
832 irq, trigger, polarity);
833
834 /*
835 * add it to the IO-APIC irq-routing table:
836 */
837 memset(&entry,0,sizeof(entry));
838
839 entry.delivery_mode = INT_DELIVERY_MODE;
840 entry.dest_mode = INT_DEST_MODE;
841 entry.dest = cpu_mask_to_apicid(mask);
842 entry.mask = 0; /* enable IRQ */
843 entry.trigger = trigger;
844 entry.polarity = polarity;
845 entry.vector = cfg->vector;
846
847 /* Mask level triggered irqs.
848 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
849 */
850 if (trigger)
851 entry.mask = 1;
852
853 ioapic_register_intr(irq, trigger);
854 if (irq < 16)
855 disable_8259A_irq(irq);
856
857 ioapic_write_entry(apic, pin, entry);
858}
859
860static void __init setup_IO_APIC_irqs(void)
861{
862 int apic, pin, idx, irq, first_notcon = 1;
863
864 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
865
866 for (apic = 0; apic < nr_ioapics; apic++) {
867 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
868
869 idx = find_irq_entry(apic,pin,mp_INT);
870 if (idx == -1) {
871 if (first_notcon) {
872 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
873 first_notcon = 0;
874 } else
875 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
876 continue;
877 }
878
879 irq = pin_2_irq(idx, apic, pin);
880 add_pin_to_irq(irq, apic, pin);
881
882 setup_IO_APIC_irq(apic, pin, irq,
883 irq_trigger(idx), irq_polarity(idx));
884 }
885 }
886
887 if (!first_notcon)
888 apic_printk(APIC_VERBOSE," not connected.\n");
889}
890
891/*
892 * Set up the 8259A-master output pin as broadcast to all
893 * CPUs.
894 */
895static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
896{
897 struct IO_APIC_route_entry entry;
898 unsigned long flags;
899
900 memset(&entry,0,sizeof(entry));
901
902 disable_8259A_irq(0);
903
904 /* mask LVT0 */
905 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
906
907 /*
908 * We use logical delivery to get the timer IRQ
909 * to the first CPU.
910 */
911 entry.dest_mode = INT_DEST_MODE;
912 entry.mask = 0; /* unmask IRQ now */
913 entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
914 entry.delivery_mode = INT_DELIVERY_MODE;
915 entry.polarity = 0;
916 entry.trigger = 0;
917 entry.vector = vector;
918
919 /*
 920 * The timer IRQ doesn't have to know that behind the
 921 * scenes we have an 8259A master in AEOI mode ...
922 */
923 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
924
925 /*
926 * Add it to the IO-APIC irq-routing table:
927 */
928 spin_lock_irqsave(&ioapic_lock, flags);
929 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
930 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
931 spin_unlock_irqrestore(&ioapic_lock, flags);
932
933 enable_8259A_irq(0);
934}
935
936void __apicdebuginit print_IO_APIC(void)
937{
938 int apic, i;
939 union IO_APIC_reg_00 reg_00;
940 union IO_APIC_reg_01 reg_01;
941 union IO_APIC_reg_02 reg_02;
942 unsigned long flags;
943
944 if (apic_verbosity == APIC_QUIET)
945 return;
946
947 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
948 for (i = 0; i < nr_ioapics; i++)
949 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
950 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
951
952 /*
953 * We are a bit conservative about what we expect. We have to
954 * know about every hardware change ASAP.
955 */
956 printk(KERN_INFO "testing the IO APIC.......................\n");
957
958 for (apic = 0; apic < nr_ioapics; apic++) {
959
960 spin_lock_irqsave(&ioapic_lock, flags);
961 reg_00.raw = io_apic_read(apic, 0);
962 reg_01.raw = io_apic_read(apic, 1);
963 if (reg_01.bits.version >= 0x10)
964 reg_02.raw = io_apic_read(apic, 2);
965 spin_unlock_irqrestore(&ioapic_lock, flags);
966
967 printk("\n");
968 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
969 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
970 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
971
972 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
973 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
974
975 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
976 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
977
978 if (reg_01.bits.version >= 0x10) {
979 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
980 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
981 }
982
983 printk(KERN_DEBUG ".... IRQ redirection table:\n");
984
985 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
986 " Stat Dmod Deli Vect: \n");
987
988 for (i = 0; i <= reg_01.bits.entries; i++) {
989 struct IO_APIC_route_entry entry;
990
991 entry = ioapic_read_entry(apic, i);
992
993 printk(KERN_DEBUG " %02x %03X ",
994 i,
995 entry.dest
996 );
997
998 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
999 entry.mask,
1000 entry.trigger,
1001 entry.irr,
1002 entry.polarity,
1003 entry.delivery_status,
1004 entry.dest_mode,
1005 entry.delivery_mode,
1006 entry.vector
1007 );
1008 }
1009 }
1010 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1011 for (i = 0; i < NR_IRQS; i++) {
1012 struct irq_pin_list *entry = irq_2_pin + i;
1013 if (entry->pin < 0)
1014 continue;
1015 printk(KERN_DEBUG "IRQ%d ", i);
1016 for (;;) {
1017 printk("-> %d:%d", entry->apic, entry->pin);
1018 if (!entry->next)
1019 break;
1020 entry = irq_2_pin + entry->next;
1021 }
1022 printk("\n");
1023 }
1024
1025 printk(KERN_INFO ".................................... done.\n");
1026
1027 return;
1028}
1029
1030#if 0
1031
1032static __apicdebuginit void print_APIC_bitfield (int base)
1033{
1034 unsigned int v;
1035 int i, j;
1036
1037 if (apic_verbosity == APIC_QUIET)
1038 return;
1039
1040 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1041 for (i = 0; i < 8; i++) {
1042 v = apic_read(base + i*0x10);
1043 for (j = 0; j < 32; j++) {
1044 if (v & (1<<j))
1045 printk("1");
1046 else
1047 printk("0");
1048 }
1049 printk("\n");
1050 }
1051}
1052
1053void __apicdebuginit print_local_APIC(void * dummy)
1054{
1055 unsigned int v, ver, maxlvt;
1056
1057 if (apic_verbosity == APIC_QUIET)
1058 return;
1059
1060 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1061 smp_processor_id(), hard_smp_processor_id());
1062 v = apic_read(APIC_ID);
1063 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
1064 v = apic_read(APIC_LVR);
1065 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1066 ver = GET_APIC_VERSION(v);
1067 maxlvt = get_maxlvt();
1068
1069 v = apic_read(APIC_TASKPRI);
1070 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1071
1072 v = apic_read(APIC_ARBPRI);
1073 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1074 v & APIC_ARBPRI_MASK);
1075 v = apic_read(APIC_PROCPRI);
1076 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1077
1078 v = apic_read(APIC_EOI);
1079 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1080 v = apic_read(APIC_RRR);
1081 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1082 v = apic_read(APIC_LDR);
1083 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1084 v = apic_read(APIC_DFR);
1085 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1086 v = apic_read(APIC_SPIV);
1087 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1088
1089 printk(KERN_DEBUG "... APIC ISR field:\n");
1090 print_APIC_bitfield(APIC_ISR);
1091 printk(KERN_DEBUG "... APIC TMR field:\n");
1092 print_APIC_bitfield(APIC_TMR);
1093 printk(KERN_DEBUG "... APIC IRR field:\n");
1094 print_APIC_bitfield(APIC_IRR);
1095
1096 v = apic_read(APIC_ESR);
1097 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1098
1099 v = apic_read(APIC_ICR);
1100 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1101 v = apic_read(APIC_ICR2);
1102 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1103
1104 v = apic_read(APIC_LVTT);
1105 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1106
1107 if (maxlvt > 3) { /* PC is LVT#4. */
1108 v = apic_read(APIC_LVTPC);
1109 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1110 }
1111 v = apic_read(APIC_LVT0);
1112 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1113 v = apic_read(APIC_LVT1);
1114 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1115
1116 if (maxlvt > 2) { /* ERR is LVT#3. */
1117 v = apic_read(APIC_LVTERR);
1118 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1119 }
1120
1121 v = apic_read(APIC_TMICT);
1122 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1123 v = apic_read(APIC_TMCCT);
1124 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1125 v = apic_read(APIC_TDCR);
1126 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1127 printk("\n");
1128}
1129
1130void print_all_local_APICs (void)
1131{
1132 on_each_cpu(print_local_APIC, NULL, 1, 1);
1133}
1134
1135void __apicdebuginit print_PIC(void)
1136{
1137 unsigned int v;
1138 unsigned long flags;
1139
1140 if (apic_verbosity == APIC_QUIET)
1141 return;
1142
1143 printk(KERN_DEBUG "\nprinting PIC contents\n");
1144
1145 spin_lock_irqsave(&i8259A_lock, flags);
1146
1147 v = inb(0xa1) << 8 | inb(0x21);
1148 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1149
1150 v = inb(0xa0) << 8 | inb(0x20);
1151 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1152
1153 outb(0x0b,0xa0);
1154 outb(0x0b,0x20);
1155 v = inb(0xa0) << 8 | inb(0x20);
1156 outb(0x0a,0xa0);
1157 outb(0x0a,0x20);
1158
1159 spin_unlock_irqrestore(&i8259A_lock, flags);
1160
1161 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1162
1163 v = inb(0x4d1) << 8 | inb(0x4d0);
1164 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1165}
1166
1167#endif /* 0 */
1168
1169static void __init enable_IO_APIC(void)
1170{
1171 union IO_APIC_reg_01 reg_01;
1172 int i8259_apic, i8259_pin;
1173 int i, apic;
1174 unsigned long flags;
1175
1176 for (i = 0; i < PIN_MAP_SIZE; i++) {
1177 irq_2_pin[i].pin = -1;
1178 irq_2_pin[i].next = 0;
1179 }
1180
1181 /*
1182 * The number of IO-APIC IRQ registers (== #pins):
1183 */
1184 for (apic = 0; apic < nr_ioapics; apic++) {
1185 spin_lock_irqsave(&ioapic_lock, flags);
1186 reg_01.raw = io_apic_read(apic, 1);
1187 spin_unlock_irqrestore(&ioapic_lock, flags);
1188 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1189 }
1190 for(apic = 0; apic < nr_ioapics; apic++) {
1191 int pin;
1192 /* See if any of the pins is in ExtINT mode */
1193 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1194 struct IO_APIC_route_entry entry;
1195 entry = ioapic_read_entry(apic, pin);
1196
 1197 /* If the interrupt line is enabled and in ExtInt mode,
 1198 * we have found the pin where the i8259 is connected.
1199 */
1200 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1201 ioapic_i8259.apic = apic;
1202 ioapic_i8259.pin = pin;
1203 goto found_i8259;
1204 }
1205 }
1206 }
1207 found_i8259:
 1208 /* Look to see if the MP table has reported the ExtINT */
1209 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1210 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
 1211 /* Trust the MP table if nothing is set up in the hardware */
1212 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1213 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1214 ioapic_i8259.pin = i8259_pin;
1215 ioapic_i8259.apic = i8259_apic;
1216 }
1217 /* Complain if the MP table and the hardware disagree */
1218 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1219 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1220 {
1221 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1222 }
1223
1224 /*
 1225 * Do not trust that the IO-APIC is empty at bootup
1226 */
1227 clear_IO_APIC();
1228}
1229
1230/*
1231 * Not an __init, needed by the reboot code
1232 */
1233void disable_IO_APIC(void)
1234{
1235 /*
1236 * Clear the IO-APIC before rebooting:
1237 */
1238 clear_IO_APIC();
1239
1240 /*
 1241 * If the i8259 is routed through an IOAPIC,
 1242 * put that IOAPIC in virtual wire mode
1243 * so legacy interrupts can be delivered.
1244 */
1245 if (ioapic_i8259.pin != -1) {
1246 struct IO_APIC_route_entry entry;
1247
1248 memset(&entry, 0, sizeof(entry));
1249 entry.mask = 0; /* Enabled */
1250 entry.trigger = 0; /* Edge */
1251 entry.irr = 0;
1252 entry.polarity = 0; /* High */
1253 entry.delivery_status = 0;
1254 entry.dest_mode = 0; /* Physical */
1255 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1256 entry.vector = 0;
1257 entry.dest = GET_APIC_ID(apic_read(APIC_ID));
1258
1259 /*
1260 * Add it to the IO-APIC irq-routing table:
1261 */
1262 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1263 }
1264
1265 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1266}
1267
1268/*
1269 * There is a nasty bug in some older SMP boards, their mptable lies
1270 * about the timer IRQ. We do the following to work around the situation:
1271 *
1272 * - timer IRQ defaults to IO-APIC IRQ
1273 * - if this function detects that timer IRQs are defunct, then we fall
1274 * back to ISA timer IRQs
1275 */
1276static int __init timer_irq_works(void)
1277{
1278 unsigned long t1 = jiffies;
1279
1280 local_irq_enable();
1281 /* Let ten ticks pass... */
1282 mdelay((10 * 1000) / HZ);
1283
1284 /*
 1285 * Expect a few ticks at least, to be sure some possible
 1286 * glue logic does not lock up after the first one or two
 1287 * ticks in a non-ExtINT mode. Also the local APIC
1288 * might have cached one ExtINT interrupt. Finally, at
1289 * least one tick may be lost due to delays.
1290 */
1291
1292 /* jiffies wrap? */
1293 if (jiffies - t1 > 4)
1294 return 1;
1295 return 0;
1296}
1297
1298/*
 1299 * In the SMP+IOAPIC case it might happen that an unspecified number
 1300 * of pending IRQ events are left unhandled. These cases are very rare,
1301 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1302 * better to do it this way as thus we do not have to be aware of
1303 * 'pending' interrupts in the IRQ path, except at this point.
1304 */
1305/*
 1306 * Edge-triggered interrupts need to resend any interrupt
 1307 * that was delayed, but this is now handled in the
 1308 * device-independent code.
1309 */
1310
1311/*
 1312 * Starting up an edge-triggered IO-APIC interrupt is
 1313 * nasty - we need to make sure that we get the edge.
 1314 * If it is already asserted for some reason, we need
 1315 * to return 1 to indicate that it was pending.
1316 *
1317 * This is not complete - we should be able to fake
1318 * an edge even if it isn't on the 8259A...
1319 */
1320
1321static unsigned int startup_ioapic_irq(unsigned int irq)
1322{
1323 int was_pending = 0;
1324 unsigned long flags;
1325
1326 spin_lock_irqsave(&ioapic_lock, flags);
1327 if (irq < 16) {
1328 disable_8259A_irq(irq);
1329 if (i8259A_irq_pending(irq))
1330 was_pending = 1;
1331 }
1332 __unmask_IO_APIC_irq(irq);
1333 spin_unlock_irqrestore(&ioapic_lock, flags);
1334
1335 return was_pending;
1336}
1337
1338static int ioapic_retrigger_irq(unsigned int irq)
1339{
1340 struct irq_cfg *cfg = &irq_cfg[irq];
1341 cpumask_t mask;
1342 unsigned long flags;
1343
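	/*
	 * Re-raise the interrupt by sending its vector as an IPI to one
	 * cpu in the irq's domain.
	 */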
1344 spin_lock_irqsave(&vector_lock, flags);
1345 cpus_clear(mask);
1346 cpu_set(first_cpu(cfg->domain), mask);
1347
1348 send_IPI_mask(mask, cfg->vector);
1349 spin_unlock_irqrestore(&vector_lock, flags);
1350
1351 return 1;
1352}
1353
1354/*
1355 * Level and edge triggered IO-APIC interrupts need different handling,
1356 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1357 * handled with the level-triggered descriptor, but that one has slightly
1358 * more overhead. Level-triggered interrupts cannot be handled with the
1359 * edge-triggered handler, without risking IRQ storms and other ugly
1360 * races.
1361 */
1362
1363#ifdef CONFIG_SMP
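/*
 * After an irq has been migrated to a new vector/domain, the cpus in the
 * old domain receive IRQ_MOVE_CLEANUP_VECTOR and release their stale
 * vector_irq[] slots here.
 */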
1364asmlinkage void smp_irq_move_cleanup_interrupt(void)
1365{
1366 unsigned vector, me;
1367 ack_APIC_irq();
1368 exit_idle();
1369 irq_enter();
1370
1371 me = smp_processor_id();
1372 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
1373 unsigned int irq;
1374 struct irq_desc *desc;
1375 struct irq_cfg *cfg;
1376 irq = __get_cpu_var(vector_irq)[vector];
1377 if (irq >= NR_IRQS)
1378 continue;
1379
1380 desc = irq_desc + irq;
1381 cfg = irq_cfg + irq;
1382 spin_lock(&desc->lock);
1383 if (!cfg->move_cleanup_count)
1384 goto unlock;
1385
1386 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
1387 goto unlock;
1388
1389 __get_cpu_var(vector_irq)[vector] = -1;
1390 cfg->move_cleanup_count--;
1391unlock:
1392 spin_unlock(&desc->lock);
1393 }
1394
1395 irq_exit();
1396}
1397
1398static void irq_complete_move(unsigned int irq)
1399{
1400 struct irq_cfg *cfg = irq_cfg + irq;
1401 unsigned vector, me;
1402
1403 if (likely(!cfg->move_in_progress))
1404 return;
1405
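	/*
	 * The interrupt entry stubs store the complemented vector number in
	 * orig_rax; recover it here to see whether this irq has already
	 * fired on its new vector.
	 */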
1406 vector = ~get_irq_regs()->orig_rax;
1407 me = smp_processor_id();
1408 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1409 cpumask_t cleanup_mask;
1410
1411 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1412 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
1413 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
1414 cfg->move_in_progress = 0;
1415 }
1416}
1417#else
1418static inline void irq_complete_move(unsigned int irq) {}
1419#endif
1420
1421static void ack_apic_edge(unsigned int irq)
1422{
1423 irq_complete_move(irq);
1424 move_native_irq(irq);
1425 ack_APIC_irq();
1426}
1427
1428static void ack_apic_level(unsigned int irq)
1429{
1430 int do_unmask_irq = 0;
1431
1432 irq_complete_move(irq);
1433#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
1434 /* If we are moving the irq we need to mask it */
1435 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
1436 do_unmask_irq = 1;
1437 mask_IO_APIC_irq(irq);
1438 }
1439#endif
1440
1441 /*
1442 * We must acknowledge the irq before we move it or the acknowledge will
1443 * not propagate properly.
1444 */
1445 ack_APIC_irq();
1446
 1447 /* Now we can move and re-enable the irq */
1448 if (unlikely(do_unmask_irq)) {
1449 /* Only migrate the irq if the ack has been received.
1450 *
1451 * On rare occasions the broadcast level triggered ack gets
1452 * delayed going to ioapics, and if we reprogram the
1453 * vector while Remote IRR is still set the irq will never
1454 * fire again.
1455 *
1456 * To prevent this scenario we read the Remote IRR bit
1457 * of the ioapic. This has two effects.
1458 * - On any sane system the read of the ioapic will
1459 * flush writes (and acks) going to the ioapic from
1460 * this cpu.
1461 * - We get to see if the ACK has actually been delivered.
1462 *
 1463 * Based on failed experiments with reprogramming the
 1464 * ioapic entry from outside of irq context - starting
 1465 * with masking the ioapic entry and then polling until
 1466 * Remote IRR was clear before reprogramming the
 1467 * ioapic - I don't trust the Remote IRR bit to be
 1468 * completely accurate.
1469 *
1470 * However there appears to be no other way to plug
1471 * this race, so if the Remote IRR bit is not
1472 * accurate and is causing problems then it is a hardware bug
1473 * and you can go talk to the chipset vendor about it.
1474 */
1475 if (!io_apic_level_ack_pending(irq))
1476 move_masked_irq(irq);
1477 unmask_IO_APIC_irq(irq);
1478 }
1479}
1480
1481static struct irq_chip ioapic_chip __read_mostly = {
1482 .name = "IO-APIC",
1483 .startup = startup_ioapic_irq,
1484 .mask = mask_IO_APIC_irq,
1485 .unmask = unmask_IO_APIC_irq,
1486 .ack = ack_apic_edge,
1487 .eoi = ack_apic_level,
1488#ifdef CONFIG_SMP
1489 .set_affinity = set_ioapic_affinity_irq,
1490#endif
1491 .retrigger = ioapic_retrigger_irq,
1492};
1493
1494static inline void init_IO_APIC_traps(void)
1495{
1496 int irq;
1497
1498 /*
1499 * NOTE! The local APIC isn't very good at handling
1500 * multiple interrupts at the same interrupt level.
1501 * As the interrupt level is determined by taking the
1502 * vector number and shifting that right by 4, we
1503 * want to spread these out a bit so that they don't
1504 * all fall in the same interrupt level.
1505 *
1506 * Also, we've got to be careful not to trash gate
1507 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1508 */
1509 for (irq = 0; irq < NR_IRQS ; irq++) {
1510 int tmp = irq;
1511 if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
1512 /*
1513 * Hmm.. We don't have an entry for this,
1514 * so default to an old-fashioned 8259
1515 * interrupt if we can..
1516 */
1517 if (irq < 16)
1518 make_8259A_irq(irq);
1519 else
1520 /* Strange. Oh, well.. */
1521 irq_desc[irq].chip = &no_irq_chip;
1522 }
1523 }
1524}
1525
1526static void enable_lapic_irq (unsigned int irq)
1527{
1528 unsigned long v;
1529
1530 v = apic_read(APIC_LVT0);
1531 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1532}
1533
1534static void disable_lapic_irq (unsigned int irq)
1535{
1536 unsigned long v;
1537
1538 v = apic_read(APIC_LVT0);
1539 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1540}
1541
1542static void ack_lapic_irq (unsigned int irq)
1543{
1544 ack_APIC_irq();
1545}
1546
1547static void end_lapic_irq (unsigned int i) { /* nothing */ }
1548
1549static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1550 .name = "local-APIC",
1551 .typename = "local-APIC-edge",
1552 .startup = NULL, /* startup_irq() not used for IRQ0 */
1553 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1554 .enable = enable_lapic_irq,
1555 .disable = disable_lapic_irq,
1556 .ack = ack_lapic_irq,
1557 .end = end_lapic_irq,
1558};
1559
1560static void setup_nmi (void)
1561{
1562 /*
1563 * Dirty trick to enable the NMI watchdog ...
1564 * We put the 8259A master into AEOI mode and
 1565 * unmask LVT0 as NMI on all local APICs.
1566 *
1567 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1568 * is from Maciej W. Rozycki - so we do not have to EOI from
1569 * the NMI handler or the timer interrupt.
1570 */
1571 printk(KERN_INFO "activating NMI Watchdog ...");
1572
1573 enable_NMI_through_LVT0(NULL);
1574
1575 printk(" done.\n");
1576}
1577
1578/*
 1579 * This looks a bit hackish but it's about the only way of sending
1580 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1581 * not support the ExtINT mode, unfortunately. We need to send these
1582 * cycles as some i82489DX-based boards have glue logic that keeps the
1583 * 8259A interrupt line asserted until INTA. --macro
1584 */
1585static inline void unlock_ExtINT_logic(void)
1586{
1587 int apic, pin, i;
1588 struct IO_APIC_route_entry entry0, entry1;
1589 unsigned char save_control, save_freq_select;
1590 unsigned long flags;
1591
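	/*
	 * The RTC (ISA IRQ8) pin is borrowed for this: its routing entry is
	 * saved, temporarily replaced with an ExtINT entry aimed at this
	 * CPU, and the RTC periodic interrupt is used to generate the INTA
	 * cycles before everything is restored.
	 */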
1592 pin = find_isa_irq_pin(8, mp_INT);
1593 apic = find_isa_irq_apic(8, mp_INT);
1594 if (pin == -1)
1595 return;
1596
1597 spin_lock_irqsave(&ioapic_lock, flags);
1598 *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1599 *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1600 spin_unlock_irqrestore(&ioapic_lock, flags);
1601 clear_IO_APIC_pin(apic, pin);
1602
1603 memset(&entry1, 0, sizeof(entry1));
1604
1605 entry1.dest_mode = 0; /* physical delivery */
1606 entry1.mask = 0; /* unmask IRQ now */
1607 entry1.dest = hard_smp_processor_id();
1608 entry1.delivery_mode = dest_ExtINT;
1609 entry1.polarity = entry0.polarity;
1610 entry1.trigger = 0;
1611 entry1.vector = 0;
1612
1613 spin_lock_irqsave(&ioapic_lock, flags);
1614 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1615 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1616 spin_unlock_irqrestore(&ioapic_lock, flags);
1617
1618 save_control = CMOS_READ(RTC_CONTROL);
1619 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1620 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1621 RTC_FREQ_SELECT);
1622 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1623
1624 i = 100;
1625 while (i-- > 0) {
1626 mdelay(10);
1627 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1628 i -= 10;
1629 }
1630
1631 CMOS_WRITE(save_control, RTC_CONTROL);
1632 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1633 clear_IO_APIC_pin(apic, pin);
1634
1635 spin_lock_irqsave(&ioapic_lock, flags);
1636 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1637 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1638 spin_unlock_irqrestore(&ioapic_lock, flags);
1639}
1640
1641/*
1642 * This code may look a bit paranoid, but it's supposed to cooperate with
1643 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1644 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1645 * fanatically on his truly buggy board.
1646 *
1647 * FIXME: really need to revamp this for modern platforms only.
1648 */
1649static inline void check_timer(void)
1650{
1651 struct irq_cfg *cfg = irq_cfg + 0;
1652 int apic1, pin1, apic2, pin2;
1653
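	/*
	 * What follows is a fallback ladder: try IRQ0 through the IO-APIC
	 * pin the MP table reports for the timer, then through the pin the
	 * i8259 is cascaded into, then as a local APIC "virtual wire" IRQ,
	 * and finally as a plain ExtINT IRQ before giving up.
	 */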
1654 /*
1655 * get/set the timer IRQ vector:
1656 */
1657 disable_8259A_irq(0);
1658 assign_irq_vector(0, TARGET_CPUS);
1659
1660 /*
 1661 * Subtle: code in do_timer_interrupt() expects an AEOI
1662 * mode for the 8259A whenever interrupts are routed
1663 * through I/O APICs. Also IRQ0 has to be enabled in
1664 * the 8259A which implies the virtual wire has to be
1665 * disabled in the local APIC.
1666 */
1667 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1668 init_8259A(1);
1669 if (timer_over_8254 > 0)
1670 enable_8259A_irq(0);
1671
1672 pin1 = find_isa_irq_pin(0, mp_INT);
1673 apic1 = find_isa_irq_apic(0, mp_INT);
1674 pin2 = ioapic_i8259.pin;
1675 apic2 = ioapic_i8259.apic;
1676
1677 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
1678 cfg->vector, apic1, pin1, apic2, pin2);
1679
1680 if (pin1 != -1) {
1681 /*
1682 * Ok, does IRQ0 through the IOAPIC work?
1683 */
1684 unmask_IO_APIC_irq(0);
1685 if (!no_timer_check && timer_irq_works()) {
1686 nmi_watchdog_default();
1687 if (nmi_watchdog == NMI_IO_APIC) {
1688 disable_8259A_irq(0);
1689 setup_nmi();
1690 enable_8259A_irq(0);
1691 }
1692 if (disable_timer_pin_1 > 0)
1693 clear_IO_APIC_pin(0, pin1);
1694 return;
1695 }
1696 clear_IO_APIC_pin(apic1, pin1);
1697 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
1698 "connected to IO-APIC\n");
1699 }
1700
1701 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
1702 "through the 8259A ... ");
1703 if (pin2 != -1) {
1704 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
1705 apic2, pin2);
1706 /*
1707 * legacy devices should be connected to IO APIC #0
1708 */
1709 setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
1710 if (timer_irq_works()) {
1711 apic_printk(APIC_VERBOSE," works.\n");
1712 nmi_watchdog_default();
1713 if (nmi_watchdog == NMI_IO_APIC) {
1714 setup_nmi();
1715 }
1716 return;
1717 }
1718 /*
1719 * Cleanup, just in case ...
1720 */
1721 clear_IO_APIC_pin(apic2, pin2);
1722 }
1723 apic_printk(APIC_VERBOSE," failed.\n");
1724
1725 if (nmi_watchdog == NMI_IO_APIC) {
1726 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1727 nmi_watchdog = 0;
1728 }
1729
1730 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1731
1732 disable_8259A_irq(0);
1733 irq_desc[0].chip = &lapic_irq_type;
1734 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1735 enable_8259A_irq(0);
1736
1737 if (timer_irq_works()) {
1738 apic_printk(APIC_VERBOSE," works.\n");
1739 return;
1740 }
1741 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1742 apic_printk(APIC_VERBOSE," failed.\n");
1743
1744 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1745
1746 init_8259A(0);
1747 make_8259A_irq(0);
1748 apic_write(APIC_LVT0, APIC_DM_EXTINT);
1749
1750 unlock_ExtINT_logic();
1751
1752 if (timer_irq_works()) {
1753 apic_printk(APIC_VERBOSE," works.\n");
1754 return;
1755 }
1756 apic_printk(APIC_VERBOSE," failed :(.\n");
1757 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1758}
1759
1760static int __init notimercheck(char *s)
1761{
1762 no_timer_check = 1;
1763 return 1;
1764}
1765__setup("no_timer_check", notimercheck);
1766
1767/*
1768 *
 1769 * IRQs that are handled by the PIC in the MPS IOAPIC case.
 1770 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
1771 * Linux doesn't really care, as it's not actually used
1772 * for any interrupt handling anyway.
1773 */
1774#define PIC_IRQS (1<<2)
1775
1776void __init setup_IO_APIC(void)
1777{
1778 enable_IO_APIC();
1779
1780 if (acpi_ioapic)
1781 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1782 else
1783 io_apic_irqs = ~PIC_IRQS;
1784
1785 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1786
1787 sync_Arb_IDs();
1788 setup_IO_APIC_irqs();
1789 init_IO_APIC_traps();
1790 check_timer();
1791 if (!acpi_ioapic)
1792 print_IO_APIC();
1793}
1794
1795struct sysfs_ioapic_data {
1796 struct sys_device dev;
1797 struct IO_APIC_route_entry entry[0];
1798};
1799static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1800
1801static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
1802{
1803 struct IO_APIC_route_entry *entry;
1804 struct sysfs_ioapic_data *data;
1805 int i;
1806
1807 data = container_of(dev, struct sysfs_ioapic_data, dev);
1808 entry = data->entry;
1809 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
1810 *entry = ioapic_read_entry(dev->id, i);
1811
1812 return 0;
1813}
1814
1815static int ioapic_resume(struct sys_device *dev)
1816{
1817 struct IO_APIC_route_entry *entry;
1818 struct sysfs_ioapic_data *data;
1819 unsigned long flags;
1820 union IO_APIC_reg_00 reg_00;
1821 int i;
1822
1823 data = container_of(dev, struct sysfs_ioapic_data, dev);
1824 entry = data->entry;
1825
1826 spin_lock_irqsave(&ioapic_lock, flags);
1827 reg_00.raw = io_apic_read(dev->id, 0);
1828 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1829 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1830 io_apic_write(dev->id, 0, reg_00.raw);
1831 }
1832 spin_unlock_irqrestore(&ioapic_lock, flags);
1833 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
1834 ioapic_write_entry(dev->id, i, entry[i]);
1835
1836 return 0;
1837}
1838
1839static struct sysdev_class ioapic_sysdev_class = {
1840 set_kset_name("ioapic"),
1841 .suspend = ioapic_suspend,
1842 .resume = ioapic_resume,
1843};
1844
1845static int __init ioapic_init_sysfs(void)
1846{
1847 struct sys_device * dev;
1848 int i, size, error = 0;
1849
1850 error = sysdev_class_register(&ioapic_sysdev_class);
1851 if (error)
1852 return error;
1853
1854 for (i = 0; i < nr_ioapics; i++ ) {
1855 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1856 * sizeof(struct IO_APIC_route_entry);
1857 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1858 if (!mp_ioapic_data[i]) {
1859 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1860 continue;
1861 }
1862 memset(mp_ioapic_data[i], 0, size);
1863 dev = &mp_ioapic_data[i]->dev;
1864 dev->id = i;
1865 dev->cls = &ioapic_sysdev_class;
1866 error = sysdev_register(dev);
1867 if (error) {
1868 kfree(mp_ioapic_data[i]);
1869 mp_ioapic_data[i] = NULL;
1870 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1871 continue;
1872 }
1873 }
1874
1875 return 0;
1876}
1877
1878device_initcall(ioapic_init_sysfs);
1879
1880/*
 1881 * Dynamic irq allocation and deallocation
1882 */
1883int create_irq(void)
1884{
1885 /* Allocate an unused irq */
1886 int irq;
1887 int new;
1888 unsigned long flags;
1889
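	/*
	 * Scan downwards from the top of the irq space so dynamically
	 * created irqs stay clear of the legacy ISA range, and claim the
	 * first entry that has no vector assigned yet.
	 */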
1890 irq = -ENOSPC;
1891 spin_lock_irqsave(&vector_lock, flags);
1892 for (new = (NR_IRQS - 1); new >= 0; new--) {
1893 if (platform_legacy_irq(new))
1894 continue;
1895 if (irq_cfg[new].vector != 0)
1896 continue;
1897 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1898 irq = new;
1899 break;
1900 }
1901 spin_unlock_irqrestore(&vector_lock, flags);
1902
1903 if (irq >= 0) {
1904 dynamic_irq_init(irq);
1905 }
1906 return irq;
1907}
1908
1909void destroy_irq(unsigned int irq)
1910{
1911 unsigned long flags;
1912
1913 dynamic_irq_cleanup(irq);
1914
1915 spin_lock_irqsave(&vector_lock, flags);
1916 __clear_irq_vector(irq);
1917 spin_unlock_irqrestore(&vector_lock, flags);
1918}
1919
1920/*
 1921 * MSI message composition
1922 */
1923#ifdef CONFIG_PCI_MSI
1924static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1925{
1926 struct irq_cfg *cfg = irq_cfg + irq;
1927 int err;
1928 unsigned dest;
1929 cpumask_t tmp;
1930
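	/*
	 * Allocate a vector for this irq and encode the resulting vector and
	 * destination APIC id into the MSI address/data pair, mirroring what
	 * an IO-APIC routing entry would carry for the same interrupt.
	 */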
1931 tmp = TARGET_CPUS;
1932 err = assign_irq_vector(irq, tmp);
1933 if (!err) {
1934 cpus_and(tmp, cfg->domain, tmp);
1935 dest = cpu_mask_to_apicid(tmp);
1936
1937 msg->address_hi = MSI_ADDR_BASE_HI;
1938 msg->address_lo =
1939 MSI_ADDR_BASE_LO |
1940 ((INT_DEST_MODE == 0) ?
1941 MSI_ADDR_DEST_MODE_PHYSICAL:
1942 MSI_ADDR_DEST_MODE_LOGICAL) |
1943 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1944 MSI_ADDR_REDIRECTION_CPU:
1945 MSI_ADDR_REDIRECTION_LOWPRI) |
1946 MSI_ADDR_DEST_ID(dest);
1947
1948 msg->data =
1949 MSI_DATA_TRIGGER_EDGE |
1950 MSI_DATA_LEVEL_ASSERT |
1951 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
1952 MSI_DATA_DELIVERY_FIXED:
1953 MSI_DATA_DELIVERY_LOWPRI) |
1954 MSI_DATA_VECTOR(cfg->vector);
1955 }
1956 return err;
1957}
1958
1959#ifdef CONFIG_SMP
1960static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
1961{
1962 struct irq_cfg *cfg = irq_cfg + irq;
1963 struct msi_msg msg;
1964 unsigned int dest;
1965 cpumask_t tmp;
1966
1967 cpus_and(tmp, mask, cpu_online_map);
1968 if (cpus_empty(tmp))
1969 return;
1970
1971 if (assign_irq_vector(irq, mask))
1972 return;
1973
1974 cpus_and(tmp, cfg->domain, mask);
1975 dest = cpu_mask_to_apicid(tmp);
1976
1977 read_msi_msg(irq, &msg);
1978
1979 msg.data &= ~MSI_DATA_VECTOR_MASK;
1980 msg.data |= MSI_DATA_VECTOR(cfg->vector);
1981 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
1982 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1983
1984 write_msi_msg(irq, &msg);
1985 irq_desc[irq].affinity = mask;
1986}
1987#endif /* CONFIG_SMP */
1988
1989/*
1990 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
1991 * which implement the MSI or MSI-X Capability Structure.
1992 */
1993static struct irq_chip msi_chip = {
1994 .name = "PCI-MSI",
1995 .unmask = unmask_msi_irq,
1996 .mask = mask_msi_irq,
1997 .ack = ack_apic_edge,
1998#ifdef CONFIG_SMP
1999 .set_affinity = set_msi_irq_affinity,
2000#endif
2001 .retrigger = ioapic_retrigger_irq,
2002};
2003
2004int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2005{
2006 struct msi_msg msg;
2007 int irq, ret;
2008 irq = create_irq();
2009 if (irq < 0)
2010 return irq;
2011
2012 ret = msi_compose_msg(dev, irq, &msg);
2013 if (ret < 0) {
2014 destroy_irq(irq);
2015 return ret;
2016 }
2017
2018 set_irq_msi(irq, desc);
2019 write_msi_msg(irq, &msg);
2020
2021 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2022
2023 return 0;
2024}
2025
2026void arch_teardown_msi_irq(unsigned int irq)
2027{
2028 destroy_irq(irq);
2029}
2030
2031#endif /* CONFIG_PCI_MSI */
2032
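The arch_setup_msi_irq()/arch_teardown_msi_irq() hooks above are not called by drivers directly; they are reached through the generic PCI MSI layer. As a rough sketch of the driver-side calling pattern (the device handler and all "my_dev" names are hypothetical examples, not part of this file), a PCI driver of this era would do something like:

	#include <linux/pci.h>
	#include <linux/interrupt.h>

	/* Hypothetical handler; name and body are illustrative only. */
	static irqreturn_t my_dev_interrupt(int irq, void *dev_id)
	{
		/* acknowledge the device here */
		return IRQ_HANDLED;
	}

	static int my_dev_setup_irq(struct pci_dev *pdev, void *dev_priv)
	{
		int err;

		err = pci_enable_msi(pdev);	/* ends up in arch_setup_msi_irq() */
		if (err)
			return err;		/* caller may fall back to legacy INTx */

		err = request_irq(pdev->irq, my_dev_interrupt, 0, "my_dev", dev_priv);
		if (err)
			pci_disable_msi(pdev);	/* ends up in arch_teardown_msi_irq() */
		return err;
	}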
2033/*
2034 * Hypertransport interrupt support
2035 */
2036#ifdef CONFIG_HT_IRQ
2037
2038#ifdef CONFIG_SMP
2039
2040static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2041{
2042 struct ht_irq_msg msg;
2043 fetch_ht_irq_msg(irq, &msg);
2044
2045 msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
2046 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2047
2048 msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
2049 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2050
2051 write_ht_irq_msg(irq, &msg);
2052}
2053
2054static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2055{
2056 struct irq_cfg *cfg = irq_cfg + irq;
2057 unsigned int dest;
2058 cpumask_t tmp;
2059
2060 cpus_and(tmp, mask, cpu_online_map);
2061 if (cpus_empty(tmp))
2062 return;
2063
2064 if (assign_irq_vector(irq, mask))
2065 return;
2066
2067 cpus_and(tmp, cfg->domain, mask);
2068 dest = cpu_mask_to_apicid(tmp);
2069
2070 target_ht_irq(irq, dest, cfg->vector);
2071 irq_desc[irq].affinity = mask;
2072}
2073#endif
2074
2075static struct irq_chip ht_irq_chip = {
2076 .name = "PCI-HT",
2077 .mask = mask_ht_irq,
2078 .unmask = unmask_ht_irq,
2079 .ack = ack_apic_edge,
2080#ifdef CONFIG_SMP
2081 .set_affinity = set_ht_irq_affinity,
2082#endif
2083 .retrigger = ioapic_retrigger_irq,
2084};
2085
2086int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2087{
2088 struct irq_cfg *cfg = irq_cfg + irq;
2089 int err;
2090 cpumask_t tmp;
2091
2092 tmp = TARGET_CPUS;
2093 err = assign_irq_vector(irq, tmp);
2094 if (!err) {
2095 struct ht_irq_msg msg;
2096 unsigned dest;
2097
2098 cpus_and(tmp, cfg->domain, tmp);
2099 dest = cpu_mask_to_apicid(tmp);
2100
2101 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2102
2103 msg.address_lo =
2104 HT_IRQ_LOW_BASE |
2105 HT_IRQ_LOW_DEST_ID(dest) |
2106 HT_IRQ_LOW_VECTOR(cfg->vector) |
2107 ((INT_DEST_MODE == 0) ?
2108 HT_IRQ_LOW_DM_PHYSICAL :
2109 HT_IRQ_LOW_DM_LOGICAL) |
2110 HT_IRQ_LOW_RQEOI_EDGE |
2111 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2112 HT_IRQ_LOW_MT_FIXED :
2113 HT_IRQ_LOW_MT_ARBITRATED) |
2114 HT_IRQ_LOW_IRQ_MASKED;
2115
2116 write_ht_irq_msg(irq, &msg);
2117
2118 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2119 handle_edge_irq, "edge");
2120 }
2121 return err;
2122}
2123#endif /* CONFIG_HT_IRQ */
2124
2125/* --------------------------------------------------------------------------
2126 ACPI-based IOAPIC Configuration
2127 -------------------------------------------------------------------------- */
2128
2129#ifdef CONFIG_ACPI
2130
2131#define IO_APIC_MAX_ID 0xFE
2132
2133int __init io_apic_get_redir_entries (int ioapic)
2134{
2135 union IO_APIC_reg_01 reg_01;
2136 unsigned long flags;
2137
2138 spin_lock_irqsave(&ioapic_lock, flags);
2139 reg_01.raw = io_apic_read(ioapic, 1);
2140 spin_unlock_irqrestore(&ioapic_lock, flags);
2141
2142 return reg_01.bits.entries;
2143}
2144
2145
2146int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2147{
2148 if (!IO_APIC_IRQ(irq)) {
2149 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2150 ioapic);
2151 return -EINVAL;
2152 }
2153
2154 /*
2155 * IRQs < 16 are already in the irq_2_pin[] map
2156 */
2157 if (irq >= 16)
2158 add_pin_to_irq(irq, ioapic, pin);
2159
2160 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
2161
2162 return 0;
2163}
2164
2165#endif /* CONFIG_ACPI */
2166
2167
2168/*
2169 * This function is currently only a helper for the i386 SMP boot process, where
2170 * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
2171 * so the mask in all cases should simply be TARGET_CPUS.
2172 */
2173#ifdef CONFIG_SMP
2174void __init setup_ioapic_dest(void)
2175{
2176 int pin, ioapic, irq, irq_entry;
2177
2178 if (skip_ioapic_setup == 1)
2179 return;
2180
2181 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
2182 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
2183 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
2184 if (irq_entry == -1)
2185 continue;
2186 irq = pin_2_irq(irq_entry, ioapic, pin);
2187
2188			/* setup_IO_APIC_irqs() could fail to get a vector for some devices
2189			 * when there are too many devices, because at that time only the boot
2190			 * CPU is online.
2191 */
2192 if (!irq_cfg[irq].vector)
2193 setup_IO_APIC_irq(ioapic, pin, irq,
2194 irq_trigger(irq_entry),
2195 irq_polarity(irq_entry));
2196 else
2197 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2198 }
2199
2200 }
2201}
2202#endif
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c
new file mode 100644
index 000000000000..3d310a946d76
--- /dev/null
+++ b/arch/x86/kernel/ioport_32.c
@@ -0,0 +1,153 @@
1/*
2 * linux/arch/i386/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
22{
23 unsigned long mask;
24 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
25 unsigned int low_index = base & (BITS_PER_LONG-1);
26 int length = low_index + extent;
27
28 if (low_index != 0) {
29 mask = (~0UL << low_index);
30 if (length < BITS_PER_LONG)
31 mask &= ~(~0UL << length);
32 if (new_value)
33 *bitmap_base++ |= mask;
34 else
35 *bitmap_base++ &= ~mask;
36 length -= BITS_PER_LONG;
37 }
38
39 mask = (new_value ? ~0UL : 0UL);
40 while (length >= BITS_PER_LONG) {
41 *bitmap_base++ = mask;
42 length -= BITS_PER_LONG;
43 }
44
45 if (length > 0) {
46 mask = ~(~0UL << length);
47 if (new_value)
48 *bitmap_base++ |= mask;
49 else
50 *bitmap_base++ &= ~mask;
51 }
52}
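The partial-word masking above is easiest to follow with a concrete case. Below is a small stand-alone sketch (illustrative only, not kernel code) that reproduces the same arithmetic for setting 4 bits starting at bit 30, which spans two 32-bit words as on i386 where BITS_PER_LONG is 32:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t bitmap[2] = { 0, 0 };
		unsigned int base = 30, extent = 4;	/* bits 30..33 */
		unsigned int low = base % 32;		/* 30 */
		int length = low + extent;		/* 34 */
		uint32_t mask;

		mask = (uint32_t)~0u << low;		/* bits 30 and 31 of word 0 */
		bitmap[0] |= mask;
		length -= 32;				/* 2 bits remain for word 1 */

		mask = ~((uint32_t)~0u << length);	/* bits 0 and 1 of word 1 */
		bitmap[1] |= mask;

		printf("%08x %08x\n", bitmap[0], bitmap[1]);	/* prints: c0000000 00000003 */
		return 0;
	}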
53
54
55/*
56 * this changes the io permissions bitmap in the current task.
57 */
58asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
59{
60 unsigned long i, max_long, bytes, bytes_updated;
61 struct thread_struct * t = &current->thread;
62 struct tss_struct * tss;
63 unsigned long *bitmap;
64
65 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
66 return -EINVAL;
67 if (turn_on && !capable(CAP_SYS_RAWIO))
68 return -EPERM;
69
70 /*
71 * If it's the first ioperm() call in this thread's lifetime, set the
72	 * IO bitmap up. ioperm() is much less timing critical than clone();
73 * this is why we delay this operation until now:
74 */
75 if (!t->io_bitmap_ptr) {
76 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
77 if (!bitmap)
78 return -ENOMEM;
79
80 memset(bitmap, 0xff, IO_BITMAP_BYTES);
81 t->io_bitmap_ptr = bitmap;
82 set_thread_flag(TIF_IO_BITMAP);
83 }
84
85 /*
86 * do it in the per-thread copy and in the TSS ...
87 *
88 * Disable preemption via get_cpu() - we must not switch away
89 * because the ->io_bitmap_max value must match the bitmap
90 * contents:
91 */
92 tss = &per_cpu(init_tss, get_cpu());
93
94 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
95
96 /*
97 * Search for a (possibly new) maximum. This is simple and stupid,
98 * to keep it obviously correct:
99 */
100 max_long = 0;
101 for (i = 0; i < IO_BITMAP_LONGS; i++)
102 if (t->io_bitmap_ptr[i] != ~0UL)
103 max_long = i;
104
105 bytes = (max_long + 1) * sizeof(long);
106 bytes_updated = max(bytes, t->io_bitmap_max);
107
108 t->io_bitmap_max = bytes;
109
110 /*
111 * Sets the lazy trigger so that the next I/O operation will
112 * reload the correct bitmap.
113 * Reset the owner so that a process switch will not set
114 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
115 */
116 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
117 tss->io_bitmap_owner = NULL;
118
119 put_cpu();
120
121 return 0;
122}
123
124/*
125 * sys_iopl has to be used when you want to access the IO ports
126 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
127 * you'd need 8kB of bitmaps/process, which is a bit excessive.
128 *
129 * Here we just change the eflags value on the stack: we allow
130 * only the super-user to do it. This depends on the stack-layout
131 * on system-call entry - see also fork() and the signal handling
132 * code.
133 */
134
135asmlinkage long sys_iopl(unsigned long unused)
136{
137 volatile struct pt_regs * regs = (struct pt_regs *) &unused;
138 unsigned int level = regs->ebx;
139 unsigned int old = (regs->eflags >> 12) & 3;
140 struct thread_struct *t = &current->thread;
141
142 if (level > 3)
143 return -EINVAL;
144 /* Trying to gain more privileges? */
145 if (level > old) {
146 if (!capable(CAP_SYS_RAWIO))
147 return -EPERM;
148 }
149 t->iopl = level << 12;
150 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
151 set_iopl_mask(t->iopl);
152 return 0;
153}
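From user space, sys_ioperm() above is normally reached through the glibc ioperm(3) wrapper. A minimal illustrative example follows (the parallel-port base address 0x378 is just a conventional example; turning bits on requires CAP_SYS_RAWIO):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (ioperm(0x378, 3, 1)) {	/* ports 0x378..0x37a, turn_on = 1 */
			perror("ioperm");
			return 1;
		}
		outb(0x55, 0x378);		/* write a byte to the data register */
		ioperm(0x378, 3, 0);		/* drop the permission again */
		return 0;
	}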
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
new file mode 100644
index 000000000000..653efa30b0f4
--- /dev/null
+++ b/arch/x86/kernel/ioport_64.c
@@ -0,0 +1,119 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/capability.h>
11#include <linux/errno.h>
12#include <linux/types.h>
13#include <linux/ioport.h>
14#include <linux/smp.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18#include <linux/syscalls.h>
19
20/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
21static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
22{
23 int i;
24 if (new_value)
25 for (i = base; i < base + extent; i++)
26 __set_bit(i, bitmap);
27 else
28 for (i = base; i < base + extent; i++)
29 clear_bit(i, bitmap);
30}
31
32/*
33 * this changes the io permissions bitmap in the current task.
34 */
35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
36{
37 unsigned int i, max_long, bytes, bytes_updated;
38 struct thread_struct * t = &current->thread;
39 struct tss_struct * tss;
40 unsigned long *bitmap;
41
42 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
43 return -EINVAL;
44 if (turn_on && !capable(CAP_SYS_RAWIO))
45 return -EPERM;
46
47 /*
48 * If it's the first ioperm() call in this thread's lifetime, set the
49	 * IO bitmap up. ioperm() is much less timing critical than clone();
50 * this is why we delay this operation until now:
51 */
52 if (!t->io_bitmap_ptr) {
53 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
54 if (!bitmap)
55 return -ENOMEM;
56
57 memset(bitmap, 0xff, IO_BITMAP_BYTES);
58 t->io_bitmap_ptr = bitmap;
59 set_thread_flag(TIF_IO_BITMAP);
60 }
61
62 /*
63 * do it in the per-thread copy and in the TSS ...
64 *
65 * Disable preemption via get_cpu() - we must not switch away
66 * because the ->io_bitmap_max value must match the bitmap
67 * contents:
68 */
69 tss = &per_cpu(init_tss, get_cpu());
70
71 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
72
73 /*
74 * Search for a (possibly new) maximum. This is simple and stupid,
75 * to keep it obviously correct:
76 */
77 max_long = 0;
78 for (i = 0; i < IO_BITMAP_LONGS; i++)
79 if (t->io_bitmap_ptr[i] != ~0UL)
80 max_long = i;
81
82 bytes = (max_long + 1) * sizeof(long);
83 bytes_updated = max(bytes, t->io_bitmap_max);
84
85 t->io_bitmap_max = bytes;
86
87 /* Update the TSS: */
88 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
89
90 put_cpu();
91
92 return 0;
93}
94
95/*
96 * sys_iopl has to be used when you want to access the IO ports
97 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
98 * you'd need 8kB of bitmaps/process, which is a bit excessive.
99 *
100 * Here we just change the eflags value on the stack: we allow
101 * only the super-user to do it. This depends on the stack-layout
102 * on system-call entry - see also fork() and the signal handling
103 * code.
104 */
105
106asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
107{
108 unsigned int old = (regs->eflags >> 12) & 3;
109
110 if (level > 3)
111 return -EINVAL;
112 /* Trying to gain more privileges? */
113 if (level > old) {
114 if (!capable(CAP_SYS_RAWIO))
115 return -EPERM;
116 }
117 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
118 return 0;
119}
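As the comment above notes, iopl() is the user-space route when ports beyond the ioperm() bitmap range are needed, since it raises the I/O privilege level for every port at once. A minimal illustrative sketch using the glibc iopl(3) wrapper (requires CAP_SYS_RAWIO):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		if (iopl(3)) {			/* grant access to all 65536 ports */
			perror("iopl");
			return 1;
		}
		/* inb()/outb() may now touch any port directly. */
		iopl(0);			/* drop the elevated level again */
		return 0;
	}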
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 000000000000..4f681bcdb1fc
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,341 @@
1/*
2 * linux/arch/i386/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86-specific interrupt
7 * entry, irq-stacks and irq statistics code. All the remaining
8 * irq logic is done by the generic kernel/irq/ code and
9 * by the x86-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/module.h>
14#include <linux/seq_file.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17#include <linux/notifier.h>
18#include <linux/cpu.h>
19#include <linux/delay.h>
20
21#include <asm/apic.h>
22#include <asm/uaccess.h>
23
24DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
25EXPORT_PER_CPU_SYMBOL(irq_stat);
26
27DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs);
29
30/*
31 * 'what should we do if we get a hw irq event on an illegal vector'.
32 * Each architecture has to answer this itself.
33 */
34void ack_bad_irq(unsigned int irq)
35{
36 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
37
38#ifdef CONFIG_X86_LOCAL_APIC
39 /*
40 * Currently unexpected vectors happen only on SMP and APIC.
41 * We _must_ ack these because every local APIC has only N
42 * irq slots per priority level, and a 'hanging, unacked' IRQ
43 * holds up an irq slot - in excessive cases (when multiple
44 * unexpected vectors occur) that might lock up the APIC
45 * completely.
46 * But only ack when the APIC is enabled -AK
47 */
48 if (cpu_has_apic)
49 ack_APIC_irq();
50#endif
51}
52
53#ifdef CONFIG_4KSTACKS
54/*
55 * per-CPU IRQ handling contexts (thread information and stack)
56 */
57union irq_ctx {
58 struct thread_info tinfo;
59 u32 stack[THREAD_SIZE/sizeof(u32)];
60};
61
62static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
63static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
64#endif
65
66/*
67 * do_IRQ handles all normal device IRQ's (the special
68 * SMP cross-CPU interrupts have their own specific
69 * handlers).
70 */
71fastcall unsigned int do_IRQ(struct pt_regs *regs)
72{
73 struct pt_regs *old_regs;
74 /* high bit used in ret_from_ code */
75 int irq = ~regs->orig_eax;
76 struct irq_desc *desc = irq_desc + irq;
77#ifdef CONFIG_4KSTACKS
78 union irq_ctx *curctx, *irqctx;
79 u32 *isp;
80#endif
81
82 if (unlikely((unsigned)irq >= NR_IRQS)) {
83 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
84 __FUNCTION__, irq);
85 BUG();
86 }
87
88 old_regs = set_irq_regs(regs);
89 irq_enter();
90#ifdef CONFIG_DEBUG_STACKOVERFLOW
91 /* Debugging check for stack overflow: is there less than 1KB free? */
92 {
93 long esp;
94
95 __asm__ __volatile__("andl %%esp,%0" :
96 "=r" (esp) : "0" (THREAD_SIZE - 1));
97 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
98 printk("do_IRQ: stack overflow: %ld\n",
99 esp - sizeof(struct thread_info));
100 dump_stack();
101 }
102 }
103#endif
104
105#ifdef CONFIG_4KSTACKS
106
107 curctx = (union irq_ctx *) current_thread_info();
108 irqctx = hardirq_ctx[smp_processor_id()];
109
110 /*
111 * this is where we switch to the IRQ stack. However, if we are
112 * already using the IRQ stack (because we interrupted a hardirq
113 * handler) we can't do that and just have to keep using the
114 * current stack (which is the irq stack already after all)
115 */
116 if (curctx != irqctx) {
117 int arg1, arg2, ebx;
118
119 /* build the stack frame on the IRQ stack */
120 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
121 irqctx->tinfo.task = curctx->tinfo.task;
122 irqctx->tinfo.previous_esp = current_stack_pointer;
123
124 /*
125 * Copy the softirq bits in preempt_count so that the
126 * softirq checks work in the hardirq context.
127 */
128 irqctx->tinfo.preempt_count =
129 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
130 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
131
132 asm volatile(
133 " xchgl %%ebx,%%esp \n"
134 " call *%%edi \n"
135 " movl %%ebx,%%esp \n"
136 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
137 : "0" (irq), "1" (desc), "2" (isp),
138 "D" (desc->handle_irq)
139 : "memory", "cc"
140 );
141 } else
142#endif
143 desc->handle_irq(irq, desc);
144
145 irq_exit();
146 set_irq_regs(old_regs);
147 return 1;
148}
149
150#ifdef CONFIG_4KSTACKS
151
152static char softirq_stack[NR_CPUS * THREAD_SIZE]
153 __attribute__((__section__(".bss.page_aligned")));
154
155static char hardirq_stack[NR_CPUS * THREAD_SIZE]
156 __attribute__((__section__(".bss.page_aligned")));
157
158/*
159 * allocate per-cpu stacks for hardirq and for softirq processing
160 */
161void irq_ctx_init(int cpu)
162{
163 union irq_ctx *irqctx;
164
165 if (hardirq_ctx[cpu])
166 return;
167
168 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
169 irqctx->tinfo.task = NULL;
170 irqctx->tinfo.exec_domain = NULL;
171 irqctx->tinfo.cpu = cpu;
172 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
173 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
174
175 hardirq_ctx[cpu] = irqctx;
176
177 irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
178 irqctx->tinfo.task = NULL;
179 irqctx->tinfo.exec_domain = NULL;
180 irqctx->tinfo.cpu = cpu;
181 irqctx->tinfo.preempt_count = 0;
182 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
183
184 softirq_ctx[cpu] = irqctx;
185
186 printk("CPU %u irqstacks, hard=%p soft=%p\n",
187 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
188}
189
190void irq_ctx_exit(int cpu)
191{
192 hardirq_ctx[cpu] = NULL;
193}
194
195extern asmlinkage void __do_softirq(void);
196
197asmlinkage void do_softirq(void)
198{
199 unsigned long flags;
200 struct thread_info *curctx;
201 union irq_ctx *irqctx;
202 u32 *isp;
203
204 if (in_interrupt())
205 return;
206
207 local_irq_save(flags);
208
209 if (local_softirq_pending()) {
210 curctx = current_thread_info();
211 irqctx = softirq_ctx[smp_processor_id()];
212 irqctx->tinfo.task = curctx->task;
213 irqctx->tinfo.previous_esp = current_stack_pointer;
214
215 /* build the stack frame on the softirq stack */
216 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
217
218 asm volatile(
219 " xchgl %%ebx,%%esp \n"
220 " call __do_softirq \n"
221 " movl %%ebx,%%esp \n"
222 : "=b"(isp)
223 : "0"(isp)
224 : "memory", "cc", "edx", "ecx", "eax"
225 );
226 /*
227		 * Shouldn't happen; we returned above if in_interrupt():
228 */
229 WARN_ON_ONCE(softirq_count());
230 }
231
232 local_irq_restore(flags);
233}
234#endif
235
236/*
237 * Interrupt statistics:
238 */
239
240atomic_t irq_err_count;
241
242/*
243 * /proc/interrupts printing:
244 */
245
246int show_interrupts(struct seq_file *p, void *v)
247{
248 int i = *(loff_t *) v, j;
249 struct irqaction * action;
250 unsigned long flags;
251
252 if (i == 0) {
253 seq_printf(p, " ");
254 for_each_online_cpu(j)
255 seq_printf(p, "CPU%-8d",j);
256 seq_putc(p, '\n');
257 }
258
259 if (i < NR_IRQS) {
260 spin_lock_irqsave(&irq_desc[i].lock, flags);
261 action = irq_desc[i].action;
262 if (!action)
263 goto skip;
264 seq_printf(p, "%3d: ",i);
265#ifndef CONFIG_SMP
266 seq_printf(p, "%10u ", kstat_irqs(i));
267#else
268 for_each_online_cpu(j)
269 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
270#endif
271 seq_printf(p, " %8s", irq_desc[i].chip->name);
272 seq_printf(p, "-%-8s", irq_desc[i].name);
273 seq_printf(p, " %s", action->name);
274
275 for (action=action->next; action; action = action->next)
276 seq_printf(p, ", %s", action->name);
277
278 seq_putc(p, '\n');
279skip:
280 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
281 } else if (i == NR_IRQS) {
282 seq_printf(p, "NMI: ");
283 for_each_online_cpu(j)
284 seq_printf(p, "%10u ", nmi_count(j));
285 seq_putc(p, '\n');
286#ifdef CONFIG_X86_LOCAL_APIC
287 seq_printf(p, "LOC: ");
288 for_each_online_cpu(j)
289 seq_printf(p, "%10u ",
290 per_cpu(irq_stat,j).apic_timer_irqs);
291 seq_putc(p, '\n');
292#endif
293 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
294#if defined(CONFIG_X86_IO_APIC)
295 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
296#endif
297 }
298 return 0;
299}
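The seq_printf() calls above produce the familiar /proc/interrupts layout: one row per IRQ with a count column per online CPU, then the chip name, flow-handler name and the attached actions, followed by the NMI/LOC/ERR summary rows. For orientation, the output on a two-CPU machine looks roughly like this (counts and device names are invented for illustration):

	           CPU0       CPU1
	  0:     123456          0   IO-APIC-edge      timer
	 14:       5012        311   IO-APIC-edge      ide0
	NMI:          0          0
	LOC:     123001     122998
	ERR:          0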
300
301#ifdef CONFIG_HOTPLUG_CPU
302#include <mach_apic.h>
303
304void fixup_irqs(cpumask_t map)
305{
306 unsigned int irq;
307 static int warned;
308
309 for (irq = 0; irq < NR_IRQS; irq++) {
310 cpumask_t mask;
311 if (irq == 2)
312 continue;
313
314 cpus_and(mask, irq_desc[irq].affinity, map);
315 if (any_online_cpu(mask) == NR_CPUS) {
316 printk("Breaking affinity for irq %i\n", irq);
317 mask = map;
318 }
319 if (irq_desc[irq].chip->set_affinity)
320 irq_desc[irq].chip->set_affinity(irq, mask);
321 else if (irq_desc[irq].action && !(warned++))
322 printk("Cannot set affinity for irq %i\n", irq);
323 }
324
325#if 0
326 barrier();
327 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
328 [note the nop - the interrupt-enable boundary on x86 is two
329 instructions from sti] - to flush out pending hardirqs and
330 IPIs. After this point nothing is supposed to reach this CPU." */
331 __asm__ __volatile__("sti; nop; cli");
332 barrier();
333#else
334 /* That doesn't seem sufficient. Give it 1ms. */
335 local_irq_enable();
336 mdelay(1);
337 local_irq_disable();
338#endif
339}
340#endif
341
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644
index 000000000000..bd11e42b22bf
--- /dev/null
+++ b/arch/x86/kernel/irq_64.c
@@ -0,0 +1,212 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <linux/delay.h>
18#include <asm/uaccess.h>
19#include <asm/io_apic.h>
20#include <asm/idle.h>
21#include <asm/smp.h>
22
23atomic_t irq_err_count;
24
25#ifdef CONFIG_DEBUG_STACKOVERFLOW
26/*
27 * Probabilistic stack overflow check:
28 *
29 * Only check the stack in process context, because everything else
30 * runs on the big interrupt stacks. Checking reliably is too expensive,
31 * so we just check from interrupts.
32 */
33static inline void stack_overflow_check(struct pt_regs *regs)
34{
35 u64 curbase = (u64)task_stack_page(current);
36 static unsigned long warned = -60*HZ;
37
38 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
39 regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
40 time_after(jiffies, warned + 60*HZ)) {
41 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
42 current->comm, curbase, regs->rsp);
43 show_stack(NULL,NULL);
44 warned = jiffies;
45 }
46}
47#endif
48
49/*
50 * Generic, controller-independent functions:
51 */
52
53int show_interrupts(struct seq_file *p, void *v)
54{
55 int i = *(loff_t *) v, j;
56 struct irqaction * action;
57 unsigned long flags;
58
59 if (i == 0) {
60 seq_printf(p, " ");
61 for_each_online_cpu(j)
62 seq_printf(p, "CPU%-8d",j);
63 seq_putc(p, '\n');
64 }
65
66 if (i < NR_IRQS) {
67 spin_lock_irqsave(&irq_desc[i].lock, flags);
68 action = irq_desc[i].action;
69 if (!action)
70 goto skip;
71 seq_printf(p, "%3d: ",i);
72#ifndef CONFIG_SMP
73 seq_printf(p, "%10u ", kstat_irqs(i));
74#else
75 for_each_online_cpu(j)
76 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
77#endif
78 seq_printf(p, " %8s", irq_desc[i].chip->name);
79 seq_printf(p, "-%-8s", irq_desc[i].name);
80
81 seq_printf(p, " %s", action->name);
82 for (action=action->next; action; action = action->next)
83 seq_printf(p, ", %s", action->name);
84 seq_putc(p, '\n');
85skip:
86 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
87 } else if (i == NR_IRQS) {
88 seq_printf(p, "NMI: ");
89 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
91 seq_putc(p, '\n');
92 seq_printf(p, "LOC: ");
93 for_each_online_cpu(j)
94 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
95 seq_putc(p, '\n');
96 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
97 }
98 return 0;
99}
100
101/*
102 * do_IRQ handles all normal device IRQ's (the special
103 * SMP cross-CPU interrupts have their own specific
104 * handlers).
105 */
106asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
107{
108 struct pt_regs *old_regs = set_irq_regs(regs);
109
110 /* high bit used in ret_from_ code */
111 unsigned vector = ~regs->orig_rax;
112 unsigned irq;
113
114 exit_idle();
115 irq_enter();
116 irq = __get_cpu_var(vector_irq)[vector];
117
118#ifdef CONFIG_DEBUG_STACKOVERFLOW
119 stack_overflow_check(regs);
120#endif
121
122 if (likely(irq < NR_IRQS))
123 generic_handle_irq(irq);
124 else {
125 if (!disable_apic)
126 ack_APIC_irq();
127
128 if (printk_ratelimit())
129 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
130 __func__, smp_processor_id(), vector);
131 }
132
133 irq_exit();
134
135 set_irq_regs(old_regs);
136 return 1;
137}
138
139#ifdef CONFIG_HOTPLUG_CPU
140void fixup_irqs(cpumask_t map)
141{
142 unsigned int irq;
143 static int warned;
144
145 for (irq = 0; irq < NR_IRQS; irq++) {
146 cpumask_t mask;
147 int break_affinity = 0;
148 int set_affinity = 1;
149
150 if (irq == 2)
151 continue;
152
153		/* interrupts are disabled at this point */
154 spin_lock(&irq_desc[irq].lock);
155
156 if (!irq_has_action(irq) ||
157 cpus_equal(irq_desc[irq].affinity, map)) {
158 spin_unlock(&irq_desc[irq].lock);
159 continue;
160 }
161
162 cpus_and(mask, irq_desc[irq].affinity, map);
163 if (cpus_empty(mask)) {
164 break_affinity = 1;
165 mask = map;
166 }
167
168 if (irq_desc[irq].chip->mask)
169 irq_desc[irq].chip->mask(irq);
170
171 if (irq_desc[irq].chip->set_affinity)
172 irq_desc[irq].chip->set_affinity(irq, mask);
173 else if (!(warned++))
174 set_affinity = 0;
175
176 if (irq_desc[irq].chip->unmask)
177 irq_desc[irq].chip->unmask(irq);
178
179 spin_unlock(&irq_desc[irq].lock);
180
181 if (break_affinity && set_affinity)
182 printk("Broke affinity for irq %i\n", irq);
183 else if (!set_affinity)
184 printk("Cannot set affinity for irq %i\n", irq);
185 }
186
187 /* That doesn't seem sufficient. Give it 1ms. */
188 local_irq_enable();
189 mdelay(1);
190 local_irq_disable();
191}
192#endif
193
194extern void call_softirq(void);
195
196asmlinkage void do_softirq(void)
197{
198 __u32 pending;
199 unsigned long flags;
200
201 if (in_interrupt())
202 return;
203
204 local_irq_save(flags);
205 pending = local_softirq_pending();
206 /* Switch to interrupt stack */
207 if (pending) {
208 call_softirq();
209 WARN_ON_ONCE(softirq_count());
210 }
211 local_irq_restore(flags);
212}
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
new file mode 100644
index 000000000000..7377ccb21335
--- /dev/null
+++ b/arch/x86/kernel/k8.c
@@ -0,0 +1,123 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/gfp.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15
16static u32 *flush_words;
17
18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
21 {}
22};
23EXPORT_SYMBOL(k8_nb_ids);
24
25struct pci_dev **k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78   they're useless anyway */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98	   when another is pending. This whole thing is so expensive anyway
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109			/* Make sure the hardware actually executed the flush */
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
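Users of this file, such as the GART IOMMU and AGP code, are expected to call cache_k8_northbridges() once and then work with the cached array. A hedged sketch of that calling pattern follows (my_gart_init() and its body are hypothetical; only the cache_k8_northbridges()/k8_northbridges/num_k8_northbridges/k8_flush_garts() interfaces come from the code above):

	#include <linux/pci.h>
	#include <linux/errno.h>
	#include <asm/k8.h>

	static int __init my_gart_init(void)
	{
		int i;

		if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
			return -ENODEV;			/* no K8 northbridge found */

		for (i = 0; i < num_k8_northbridges; i++) {
			struct pci_dev *nb = k8_northbridges[i];
			/* program per-northbridge GART registers on nb here */
		}

		k8_flush_garts();			/* make the new mappings visible */
		return 0;
	}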
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
new file mode 100644
index 000000000000..448a50b1324c
--- /dev/null
+++ b/arch/x86/kernel/kprobes_32.c
@@ -0,0 +1,751 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/i386/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
27 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
28 * <prasanna@in.ibm.com> added function-return probes.
29 */
30
31#include <linux/kprobes.h>
32#include <linux/ptrace.h>
33#include <linux/preempt.h>
34#include <linux/kdebug.h>
35#include <asm/cacheflush.h>
36#include <asm/desc.h>
37#include <asm/uaccess.h>
38#include <asm/alternative.h>
39
40void jprobe_return_end(void);
41
42DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
43DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
44
45/* insert a jmp code */
46static __always_inline void set_jmp_op(void *from, void *to)
47{
48 struct __arch_jmp_op {
49 char op;
50 long raddr;
51 } __attribute__((packed)) *jop;
52 jop = (struct __arch_jmp_op *)from;
53 jop->raddr = (long)(to) - ((long)(from) + 5);
54 jop->op = RELATIVEJUMP_INSTRUCTION;
55}
56
57/*
58 * returns non-zero if opcodes can be boosted.
59 */
60static __always_inline int can_boost(kprobe_opcode_t *opcodes)
61{
62#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
63 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
64 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
65 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
66 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
67 << (row % 32))
68 /*
69	 * Undefined/reserved opcodes, conditional jumps, Opcode Extension
70	 * Groups, and some special opcodes cannot be boosted.
71 */
72 static const unsigned long twobyte_is_boostable[256 / 32] = {
73 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
74 /* ------------------------------- */
75 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
76 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
77 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
78 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
79 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
80 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
81 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
82 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
83 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
84 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
85 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
86 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
87 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
88 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
89 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
90 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
91 /* ------------------------------- */
92 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
93 };
94#undef W
95 kprobe_opcode_t opcode;
96 kprobe_opcode_t *orig_opcodes = opcodes;
97retry:
98 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
99 return 0;
100 opcode = *(opcodes++);
101
102 /* 2nd-byte opcode */
103 if (opcode == 0x0f) {
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 return test_bit(*opcodes, twobyte_is_boostable);
107 }
108
109 switch (opcode & 0xf0) {
110 case 0x60:
111 if (0x63 < opcode && opcode < 0x67)
112 goto retry; /* prefixes */
113 /* can't boost Address-size override and bound */
114 return (opcode != 0x62 && opcode != 0x67);
115 case 0x70:
116 return 0; /* can't boost conditional jump */
117 case 0xc0:
118		/* can't boost software interrupts */
119 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
120 case 0xd0:
121 /* can boost AA* and XLAT */
122 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
123 case 0xe0:
124 /* can boost in/out and absolute jmps */
125 return ((opcode & 0x04) || opcode == 0xea);
126 case 0xf0:
127 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
128 goto retry; /* lock/rep(ne) prefix */
129		/* flag-clearing and flag-setting instructions can be boosted */
130 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
131 default:
132 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
133 goto retry; /* prefixes */
134 /* can't boost CS override and call */
135 return (opcode != 0x2e && opcode != 0x9a);
136 }
137}
138
139/*
140 * returns non-zero if opcode modifies the interrupt flag.
141 */
142static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
143{
144 switch (opcode) {
145 case 0xfa: /* cli */
146 case 0xfb: /* sti */
147 case 0xcf: /* iret/iretd */
148 case 0x9d: /* popf/popfd */
149 return 1;
150 }
151 return 0;
152}
153
154int __kprobes arch_prepare_kprobe(struct kprobe *p)
155{
156 /* insn: must be on special executable page on i386. */
157 p->ainsn.insn = get_insn_slot();
158 if (!p->ainsn.insn)
159 return -ENOMEM;
160
161 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
162 p->opcode = *p->addr;
163 if (can_boost(p->addr)) {
164 p->ainsn.boostable = 0;
165 } else {
166 p->ainsn.boostable = -1;
167 }
168 return 0;
169}
170
171void __kprobes arch_arm_kprobe(struct kprobe *p)
172{
173 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
174}
175
176void __kprobes arch_disarm_kprobe(struct kprobe *p)
177{
178 text_poke(p->addr, &p->opcode, 1);
179}
180
181void __kprobes arch_remove_kprobe(struct kprobe *p)
182{
183 mutex_lock(&kprobe_mutex);
184 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
185 mutex_unlock(&kprobe_mutex);
186}
187
188static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
189{
190 kcb->prev_kprobe.kp = kprobe_running();
191 kcb->prev_kprobe.status = kcb->kprobe_status;
192 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
193 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
194}
195
196static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
197{
198 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
199 kcb->kprobe_status = kcb->prev_kprobe.status;
200 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
201 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
202}
203
204static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
205 struct kprobe_ctlblk *kcb)
206{
207 __get_cpu_var(current_kprobe) = p;
208 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
209 = (regs->eflags & (TF_MASK | IF_MASK));
210 if (is_IF_modifier(p->opcode))
211 kcb->kprobe_saved_eflags &= ~IF_MASK;
212}
213
214static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
215{
216 regs->eflags |= TF_MASK;
217 regs->eflags &= ~IF_MASK;
218	/* single step inline if the instruction is an int3 */
219 if (p->opcode == BREAKPOINT_INSTRUCTION)
220 regs->eip = (unsigned long)p->addr;
221 else
222 regs->eip = (unsigned long)p->ainsn.insn;
223}
224
225/* Called with kretprobe_lock held */
226void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
227 struct pt_regs *regs)
228{
229 unsigned long *sara = (unsigned long *)&regs->esp;
230
231 ri->ret_addr = (kprobe_opcode_t *) *sara;
232
233 /* Replace the return addr with trampoline addr */
234 *sara = (unsigned long) &kretprobe_trampoline;
235}
236
237/*
238 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
239 * remain disabled throughout this function.
240 */
241static int __kprobes kprobe_handler(struct pt_regs *regs)
242{
243 struct kprobe *p;
244 int ret = 0;
245 kprobe_opcode_t *addr;
246 struct kprobe_ctlblk *kcb;
247
248 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
249
250 /*
251 * We don't want to be preempted for the entire
252 * duration of kprobe processing
253 */
254 preempt_disable();
255 kcb = get_kprobe_ctlblk();
256
257 /* Check we're not actually recursing */
258 if (kprobe_running()) {
259 p = get_kprobe(addr);
260 if (p) {
261 if (kcb->kprobe_status == KPROBE_HIT_SS &&
262 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kcb->kprobe_saved_eflags;
265 goto no_kprobe;
266 }
267 /* We have reentered the kprobe_handler(), since
268 * another probe was hit while within the handler.
269			 * Here we save the original kprobes variables and
270 * just single step on the instruction of the new probe
271 * without calling any user handlers.
272 */
273 save_previous_kprobe(kcb);
274 set_current_kprobe(p, regs, kcb);
275 kprobes_inc_nmissed_count(p);
276 prepare_singlestep(p, regs);
277 kcb->kprobe_status = KPROBE_REENTER;
278 return 1;
279 } else {
280 if (*addr != BREAKPOINT_INSTRUCTION) {
281 /* The breakpoint instruction was removed by
282				 * another cpu right after we hit it; no further
283				 * handling of this interrupt is appropriate.
284 */
285 regs->eip -= sizeof(kprobe_opcode_t);
286 ret = 1;
287 goto no_kprobe;
288 }
289 p = __get_cpu_var(current_kprobe);
290 if (p->break_handler && p->break_handler(p, regs)) {
291 goto ss_probe;
292 }
293 }
294 goto no_kprobe;
295 }
296
297 p = get_kprobe(addr);
298 if (!p) {
299 if (*addr != BREAKPOINT_INSTRUCTION) {
300 /*
301 * The breakpoint instruction was removed right
302 * after we hit it. Another cpu has removed
303 * either a probepoint or a debugger breakpoint
304 * at this address. In either case, no further
305 * handling of this interrupt is appropriate.
306 * Back up over the (now missing) int3 and run
307 * the original instruction.
308 */
309 regs->eip -= sizeof(kprobe_opcode_t);
310 ret = 1;
311 }
312 /* Not one of ours: let kernel handle it */
313 goto no_kprobe;
314 }
315
316 set_current_kprobe(p, regs, kcb);
317 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
318
319 if (p->pre_handler && p->pre_handler(p, regs))
320 /* handler has already set things up, so skip ss setup */
321 return 1;
322
323ss_probe:
324#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
325 if (p->ainsn.boostable == 1 && !p->post_handler){
326 /* Boost up -- we can execute copied instructions directly */
327 reset_current_kprobe();
328 regs->eip = (unsigned long)p->ainsn.insn;
329 preempt_enable_no_resched();
330 return 1;
331 }
332#endif
333 prepare_singlestep(p, regs);
334 kcb->kprobe_status = KPROBE_HIT_SS;
335 return 1;
336
337no_kprobe:
338 preempt_enable_no_resched();
339 return ret;
340}
341
342/*
343 * For function-return probes, init_kprobes() establishes a probepoint
344 * here. When a retprobed function returns, this probe is hit and
345 * trampoline_probe_handler() runs, calling the kretprobe's handler.
346 */
347 void __kprobes kretprobe_trampoline_holder(void)
348 {
349 asm volatile ( ".global kretprobe_trampoline\n"
350 "kretprobe_trampoline: \n"
351 " pushf\n"
352 /* skip cs, eip, orig_eax */
353 " subl $12, %esp\n"
354 " pushl %fs\n"
355 " pushl %ds\n"
356 " pushl %es\n"
357 " pushl %eax\n"
358 " pushl %ebp\n"
359 " pushl %edi\n"
360 " pushl %esi\n"
361 " pushl %edx\n"
362 " pushl %ecx\n"
363 " pushl %ebx\n"
364 " movl %esp, %eax\n"
365 " call trampoline_handler\n"
366 /* move eflags to cs */
367 " movl 52(%esp), %edx\n"
368 " movl %edx, 48(%esp)\n"
369 /* save true return address on eflags */
370 " movl %eax, 52(%esp)\n"
371 " popl %ebx\n"
372 " popl %ecx\n"
373 " popl %edx\n"
374 " popl %esi\n"
375 " popl %edi\n"
376 " popl %ebp\n"
377 " popl %eax\n"
378 /* skip eip, orig_eax, es, ds, fs */
379 " addl $20, %esp\n"
380 " popf\n"
381 " ret\n");
382}
383
384/*
385 * Called from kretprobe_trampoline
386 */
387fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
388{
389 struct kretprobe_instance *ri = NULL;
390 struct hlist_head *head, empty_rp;
391 struct hlist_node *node, *tmp;
392 unsigned long flags, orig_ret_address = 0;
393 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
394
395 INIT_HLIST_HEAD(&empty_rp);
396 spin_lock_irqsave(&kretprobe_lock, flags);
397 head = kretprobe_inst_table_head(current);
398 /* fixup registers */
399 regs->xcs = __KERNEL_CS | get_kernel_rpl();
400 regs->eip = trampoline_address;
401 regs->orig_eax = 0xffffffff;
402
403 /*
404 * It is possible to have multiple instances associated with a given
405	 * task, either because multiple functions in the call path
406	 * have a return probe installed on them, and/or more than one
407	 * return probe was registered for a target function.
408 *
409 * We can handle this because:
410 * - instances are always inserted at the head of the list
411 * - when multiple return probes are registered for the same
412 * function, the first instance's ret_addr will point to the
413 * real return address, and all the rest will point to
414 * kretprobe_trampoline
415 */
416 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
417 if (ri->task != current)
418 /* another task is sharing our hash bucket */
419 continue;
420
421 if (ri->rp && ri->rp->handler){
422 __get_cpu_var(current_kprobe) = &ri->rp->kp;
423 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
424 ri->rp->handler(ri, regs);
425 __get_cpu_var(current_kprobe) = NULL;
426 }
427
428 orig_ret_address = (unsigned long)ri->ret_addr;
429 recycle_rp_inst(ri, &empty_rp);
430
431 if (orig_ret_address != trampoline_address)
432 /*
433 * This is the real return address. Any other
434 * instances associated with this task are for
435 * other calls deeper on the call stack
436 */
437 break;
438 }
439
440 kretprobe_assert(ri, orig_ret_address, trampoline_address);
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442
443 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
444 hlist_del(&ri->hlist);
445 kfree(ri);
446 }
447 return (void*)orig_ret_address;
448}
449
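The trampoline plumbing above stays hidden behind the generic kprobes API; a return probe is installed with register_kretprobe(). A minimal illustrative module fragment follows (the probed symbol "do_fork", the handler body and the maxactive value are example choices only, not anything mandated by this file):

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
	{
		printk(KERN_INFO "probed function returned, eax=%lx\n", regs->eax);
		return 0;
	}

	static struct kretprobe my_kretprobe = {
		.handler	= my_ret_handler,
		.maxactive	= 20,		/* concurrent instances to track */
	};

	static int __init my_probe_init(void)
	{
		my_kretprobe.kp.symbol_name = "do_fork";
		return register_kretprobe(&my_kretprobe);
	}

	static void __exit my_probe_exit(void)
	{
		unregister_kretprobe(&my_kretprobe);
	}

	module_init(my_probe_init);
	module_exit(my_probe_exit);
	MODULE_LICENSE("GPL");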
450/*
451 * Called after single-stepping. p->addr is the address of the
452 * instruction whose first byte has been replaced by the "int 3"
453 * instruction. To avoid the SMP problems that can occur when we
454 * temporarily put back the original opcode to single-step, we
455 * single-stepped a copy of the instruction. The address of this
456 * copy is p->ainsn.insn.
457 *
458 * This function prepares to return from the post-single-step
459 * interrupt. We have to fix up the stack as follows:
460 *
461 * 0) Except in the case of absolute or indirect jump or call instructions,
462 * the new eip is relative to the copied instruction. We need to make
463 * it relative to the original instruction.
464 *
465 * 1) If the single-stepped instruction was pushfl, then the TF and IF
466 * flags are set in the just-pushed eflags, and may need to be cleared.
467 *
468 * 2) If the single-stepped instruction was a call, the return address
469 * that is atop the stack is the address following the copied instruction.
470 * We need to make it the address following the original instruction.
471 *
472 * This function also checks instruction size for preparing direct execution.
473 */
474static void __kprobes resume_execution(struct kprobe *p,
475 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
476{
477 unsigned long *tos = (unsigned long *)&regs->esp;
478 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
479 unsigned long orig_eip = (unsigned long)p->addr;
480
481 regs->eflags &= ~TF_MASK;
482 switch (p->ainsn.insn[0]) {
483 case 0x9c: /* pushfl */
484 *tos &= ~(TF_MASK | IF_MASK);
485 *tos |= kcb->kprobe_old_eflags;
486 break;
487 case 0xc2: /* iret/ret/lret */
488 case 0xc3:
489 case 0xca:
490 case 0xcb:
491 case 0xcf:
492 case 0xea: /* jmp absolute -- eip is correct */
493 /* eip is already adjusted, no more changes required */
494 p->ainsn.boostable = 1;
495 goto no_change;
496 case 0xe8: /* call relative - Fix return addr */
497 *tos = orig_eip + (*tos - copy_eip);
498 break;
499 case 0x9a: /* call absolute -- same as call absolute, indirect */
500 *tos = orig_eip + (*tos - copy_eip);
501 goto no_change;
502 case 0xff:
503 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
504 /*
505 * call absolute, indirect
506 * Fix return addr; eip is correct.
507 * But this is not boostable
508 */
509 *tos = orig_eip + (*tos - copy_eip);
510 goto no_change;
511 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
512 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
513 /* eip is correct. And this is boostable */
514 p->ainsn.boostable = 1;
515 goto no_change;
516 }
517 default:
518 break;
519 }
520
521 if (p->ainsn.boostable == 0) {
522 if ((regs->eip > copy_eip) &&
523 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
524 /*
525			 * These instructions can be executed directly,
526			 * provided we jump back to the correct address.
527 */
528 set_jmp_op((void *)regs->eip,
529 (void *)orig_eip + (regs->eip - copy_eip));
530 p->ainsn.boostable = 1;
531 } else {
532 p->ainsn.boostable = -1;
533 }
534 }
535
536 regs->eip = orig_eip + (regs->eip - copy_eip);
537
538no_change:
539 return;
540}
541
542/*
543 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
544 * remain disabled throughout this function.
545 */
546static int __kprobes post_kprobe_handler(struct pt_regs *regs)
547{
548 struct kprobe *cur = kprobe_running();
549 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
550
551 if (!cur)
552 return 0;
553
554 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
555 kcb->kprobe_status = KPROBE_HIT_SSDONE;
556 cur->post_handler(cur, regs, 0);
557 }
558
559 resume_execution(cur, regs, kcb);
560 regs->eflags |= kcb->kprobe_saved_eflags;
561
562	/* Restore the original saved kprobes variables and continue. */
563 if (kcb->kprobe_status == KPROBE_REENTER) {
564 restore_previous_kprobe(kcb);
565 goto out;
566 }
567 reset_current_kprobe();
568out:
569 preempt_enable_no_resched();
570
571 /*
572 * if somebody else is singlestepping across a probe point, eflags
573 * will have TF set, in which case, continue the remaining processing
574 * of do_debug, as if this is not a probe hit.
575 */
576 if (regs->eflags & TF_MASK)
577 return 0;
578
579 return 1;
580}
581
582static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
583{
584 struct kprobe *cur = kprobe_running();
585 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
586
587 switch(kcb->kprobe_status) {
588 case KPROBE_HIT_SS:
589 case KPROBE_REENTER:
590 /*
591 * We are here because the instruction being single
592		 * stepped caused a page fault. We reset the current
593		 * kprobe, point eip back to the probe address, and
594		 * allow the page fault handler to continue as a
595		 * normal page fault.
596 */
597 regs->eip = (unsigned long)cur->addr;
598 regs->eflags |= kcb->kprobe_old_eflags;
599 if (kcb->kprobe_status == KPROBE_REENTER)
600 restore_previous_kprobe(kcb);
601 else
602 reset_current_kprobe();
603 preempt_enable_no_resched();
604 break;
605 case KPROBE_HIT_ACTIVE:
606 case KPROBE_HIT_SSDONE:
607 /*
608		 * We increment the nmissed count for accounting;
609		 * we can also use the npre/npostfault counts for accounting
610		 * these specific fault cases.
611 */
612 kprobes_inc_nmissed_count(cur);
613
614 /*
615 * We come here because instructions in the pre/post
616		 * handler caused the page fault; this could happen
617		 * if the handler tries to access user space via
618		 * copy_from_user(), get_user(), etc. Let the
619 * user-specified handler try to fix it first.
620 */
621 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
622 return 1;
623
624 /*
625 * In case the user-specified fault handler returned
626 * zero, try to fix up.
627 */
628 if (fixup_exception(regs))
629 return 1;
630
631 /*
632 * fixup_exception() could not handle it,
633 * Let do_page_fault() fix it.
634 */
635 break;
636 default:
637 break;
638 }
639 return 0;
640}
641
642/*
643 * Wrapper routine for handling exceptions.
644 */
645int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
646 unsigned long val, void *data)
647{
648 struct die_args *args = (struct die_args *)data;
649 int ret = NOTIFY_DONE;
650
651 if (args->regs && user_mode_vm(args->regs))
652 return ret;
653
654 switch (val) {
655 case DIE_INT3:
656 if (kprobe_handler(args->regs))
657 ret = NOTIFY_STOP;
658 break;
659 case DIE_DEBUG:
660 if (post_kprobe_handler(args->regs))
661 ret = NOTIFY_STOP;
662 break;
663 case DIE_GPF:
664 case DIE_PAGE_FAULT:
665 /* kprobe_running() needs smp_processor_id() */
666 preempt_disable();
667 if (kprobe_running() &&
668 kprobe_fault_handler(args->regs, args->trapnr))
669 ret = NOTIFY_STOP;
670 preempt_enable();
671 break;
672 default:
673 break;
674 }
675 return ret;
676}
677
678int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
679{
680 struct jprobe *jp = container_of(p, struct jprobe, kp);
681 unsigned long addr;
682 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
683
684 kcb->jprobe_saved_regs = *regs;
685 kcb->jprobe_saved_esp = &regs->esp;
686 addr = (unsigned long)(kcb->jprobe_saved_esp);
687
688 /*
689 * TBD: As Linus pointed out, gcc assumes that the callee
690 * owns the argument space and could overwrite it, e.g.
691 * tailcall optimization. So, to be absolutely safe
692 * we also save and restore enough stack bytes to cover
693 * the argument area.
694 */
695 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
696 MIN_STACK_SIZE(addr));
697 regs->eflags &= ~IF_MASK;
698 regs->eip = (unsigned long)(jp->entry);
699 return 1;
700}
701
702void __kprobes jprobe_return(void)
703{
704 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
705
706 asm volatile (" xchgl %%ebx,%%esp \n"
707 " int3 \n"
708 " .globl jprobe_return_end \n"
709 " jprobe_return_end: \n"
710 " nop \n"::"b"
711 (kcb->jprobe_saved_esp):"memory");
712}
713
714int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
715{
716 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
717 u8 *addr = (u8 *) (regs->eip - 1);
718 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
719 struct jprobe *jp = container_of(p, struct jprobe, kp);
720
721 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
722 if (&regs->esp != kcb->jprobe_saved_esp) {
723 struct pt_regs *saved_regs =
724 container_of(kcb->jprobe_saved_esp,
725 struct pt_regs, esp);
726 printk("current esp %p does not match saved esp %p\n",
727 &regs->esp, kcb->jprobe_saved_esp);
728 printk("Saved registers for jprobe %p\n", jp);
729 show_registers(saved_regs);
730 printk("Current registers\n");
731 show_registers(regs);
732 BUG();
733 }
734 *regs = kcb->jprobe_saved_regs;
735 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
736 MIN_STACK_SIZE(stack_addr));
737 preempt_enable_no_resched();
738 return 1;
739 }
740 return 0;
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 return 0;
746}
747
748int __init arch_init_kprobes(void)
749{
750 return 0;
751}
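None of the arch_* hooks in this file are called by probe authors directly; they are driven by the generic kprobes core when a probe is registered. For orientation, a minimal illustrative entry-probe module of this era looks like the sketch below (the symbol "do_exit" and the handler body are examples only):

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		printk(KERN_INFO "kprobe hit at %p, eip=%lx\n", p->addr, regs->eip);
		return 0;			/* let the probed instruction run */
	}

	static struct kprobe my_kprobe = {
		.symbol_name	= "do_exit",
		.pre_handler	= my_pre_handler,
	};

	static int __init my_kprobe_init(void)
	{
		return register_kprobe(&my_kprobe);	/* arms the int3 breakpoint */
	}

	static void __exit my_kprobe_exit(void)
	{
		unregister_kprobe(&my_kprobe);
	}

	module_init(my_kprobe_init);
	module_exit(my_kprobe_exit);
	MODULE_LICENSE("GPL");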
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
new file mode 100644
index 000000000000..a30e004682e2
--- /dev/null
+++ b/arch/x86/kernel/kprobes_64.c
@@ -0,0 +1,749 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86_64/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
31 * Added function return probes functionality
32 */
33
34#include <linux/kprobes.h>
35#include <linux/ptrace.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/module.h>
40#include <linux/kdebug.h>
41
42#include <asm/pgtable.h>
43#include <asm/uaccess.h>
44#include <asm/alternative.h>
45
46void jprobe_return_end(void);
47static void __kprobes arch_copy_kprobe(struct kprobe *p);
48
49DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
50DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
51
52/*
53 * returns non-zero if opcode modifies the interrupt flag.
54 */
55static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
56{
57 switch (*insn) {
58 case 0xfa: /* cli */
59 case 0xfb: /* sti */
60 case 0xcf: /* iret/iretd */
61 case 0x9d: /* popf/popfd */
62 return 1;
63 }
64
65 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
66 return 1;
67 return 0;
68}
69
70int __kprobes arch_prepare_kprobe(struct kprobe *p)
71{
72 /* insn: must be on special executable page on x86_64. */
73 p->ainsn.insn = get_insn_slot();
74 if (!p->ainsn.insn) {
75 return -ENOMEM;
76 }
77 arch_copy_kprobe(p);
78 return 0;
79}
80
81/*
82 * Determine if the instruction uses the %rip-relative addressing mode.
83 * If it does, return the address of the 32-bit displacement word.
84 * If not, return null.
85 */
86static s32 __kprobes *is_riprel(u8 *insn)
87{
88#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
89 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
90 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
91 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
92 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
93 << (row % 64))
94 static const u64 onebyte_has_modrm[256 / 64] = {
95 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
96 /* ------------------------------- */
97 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
98 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
99 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
100 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
101 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
102 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
103 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
104 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
105 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
106 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
107 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
108 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
109 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
110 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
111 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
112 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
113 /* ------------------------------- */
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 };
116 static const u64 twobyte_has_modrm[256 / 64] = {
117 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
118 /* ------------------------------- */
119 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
120 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
121 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
122 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
123 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
124 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
125 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
126 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
127 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
128 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
129 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
130 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
131 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
132 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
133 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
134 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
135 /* ------------------------------- */
136 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
137 };
138#undef W
139 int need_modrm;
140
141 /* Skip legacy instruction prefixes. */
142 while (1) {
143 switch (*insn) {
144 case 0x66:
145 case 0x67:
146 case 0x2e:
147 case 0x3e:
148 case 0x26:
149 case 0x64:
150 case 0x65:
151 case 0x36:
152 case 0xf0:
153 case 0xf3:
154 case 0xf2:
155 ++insn;
156 continue;
157 }
158 break;
159 }
160
161 /* Skip REX instruction prefix. */
162 if ((*insn & 0xf0) == 0x40)
163 ++insn;
164
165 if (*insn == 0x0f) { /* Two-byte opcode. */
166 ++insn;
167 need_modrm = test_bit(*insn, twobyte_has_modrm);
168 } else { /* One-byte opcode. */
169 need_modrm = test_bit(*insn, onebyte_has_modrm);
170 }
171
172 if (need_modrm) {
173 u8 modrm = *++insn;
174 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
175 /* Displacement follows ModRM byte. */
176 return (s32 *) ++insn;
177 }
178 }
179
180 /* No %rip-relative addressing mode here. */
181 return NULL;
182}
183
184static void __kprobes arch_copy_kprobe(struct kprobe *p)
185{
186 s32 *ripdisp;
187 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
188 ripdisp = is_riprel(p->ainsn.insn);
189 if (ripdisp) {
190 /*
191 * The copied instruction uses the %rip-relative
192 * addressing mode. Adjust the displacement for the
193 * difference between the original location of this
194 * instruction and the location of the copy that will
195 * actually be run. The tricky bit here is making sure
196 * that the sign extension happens correctly in this
197 * calculation, since we need a signed 32-bit result to
198 * be sign-extended to 64 bits when it's added to the
199 * %rip value and yield the same 64-bit result that the
200 * sign-extension of the original signed 32-bit
201 * displacement would have given.
202 */
203 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
204 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
205 *ripdisp = disp;
206 }
207 p->opcode = *p->addr;
208}
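/*
 * Worked illustration of the fixup above (an editorial sketch, not part of
 * the file): suppose the probed instruction is "mov 0x1fd6(%rip),%rax",
 * encoded as 48 8b 05 d6 1f 00 00.  is_riprel() skips the REX prefix 0x48,
 * finds opcode 0x8b with ModRM 0x05 and returns a pointer to the d6 1f 00 00
 * displacement.  The helper below repeats the arithmetic arch_copy_kprobe()
 * applies to that displacement: the copy must still reference the same
 * absolute target, so its displacement grows by (original - copy).
 */
static inline s32 riprel_new_disp(u8 *orig_insn, u8 *copy_insn, s32 old_disp)
{
	s64 disp = orig_insn + old_disp - copy_insn;

	BUG_ON((s64)(s32)disp != disp);	/* must still fit in a signed 32-bit value */
	return (s32)disp;
}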
209
210void __kprobes arch_arm_kprobe(struct kprobe *p)
211{
212 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
213}
214
215void __kprobes arch_disarm_kprobe(struct kprobe *p)
216{
217 text_poke(p->addr, &p->opcode, 1);
218}
219
220void __kprobes arch_remove_kprobe(struct kprobe *p)
221{
222 mutex_lock(&kprobe_mutex);
223 free_insn_slot(p->ainsn.insn, 0);
224 mutex_unlock(&kprobe_mutex);
225}
226
227static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
228{
229 kcb->prev_kprobe.kp = kprobe_running();
230 kcb->prev_kprobe.status = kcb->kprobe_status;
231 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
232 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
233}
234
235static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
236{
237 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
238 kcb->kprobe_status = kcb->prev_kprobe.status;
239 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
240 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
241}
242
243static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
244 struct kprobe_ctlblk *kcb)
245{
246 __get_cpu_var(current_kprobe) = p;
247 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
248 = (regs->eflags & (TF_MASK | IF_MASK));
249 if (is_IF_modifier(p->ainsn.insn))
250 kcb->kprobe_saved_rflags &= ~IF_MASK;
251}
252
253static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
254{
255 regs->eflags |= TF_MASK;
256 regs->eflags &= ~IF_MASK;
257	/* single-step inline if the instruction is an int3 */
258 if (p->opcode == BREAKPOINT_INSTRUCTION)
259 regs->rip = (unsigned long)p->addr;
260 else
261 regs->rip = (unsigned long)p->ainsn.insn;
262}
263
264/* Called with kretprobe_lock held */
265void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
266 struct pt_regs *regs)
267{
268 unsigned long *sara = (unsigned long *)regs->rsp;
269
270 ri->ret_addr = (kprobe_opcode_t *) *sara;
271 /* Replace the return addr with trampoline addr */
272 *sara = (unsigned long) &kretprobe_trampoline;
273}
274
275int __kprobes kprobe_handler(struct pt_regs *regs)
276{
277 struct kprobe *p;
278 int ret = 0;
279 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
280 struct kprobe_ctlblk *kcb;
281
282 /*
283 * We don't want to be preempted for the entire
284 * duration of kprobe processing
285 */
286 preempt_disable();
287 kcb = get_kprobe_ctlblk();
288
289 /* Check we're not actually recursing */
290 if (kprobe_running()) {
291 p = get_kprobe(addr);
292 if (p) {
293 if (kcb->kprobe_status == KPROBE_HIT_SS &&
294 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
295 regs->eflags &= ~TF_MASK;
296 regs->eflags |= kcb->kprobe_saved_rflags;
297 goto no_kprobe;
298 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
299 /* TODO: Provide re-entrancy from
300 * post_kprobes_handler() and avoid exception
301 * stack corruption while single-stepping on
302 * the instruction of the new probe.
303 */
304 arch_disarm_kprobe(p);
305 regs->rip = (unsigned long)p->addr;
306 reset_current_kprobe();
307 ret = 1;
308 } else {
309 /* We have reentered the kprobe_handler(), since
310 * another probe was hit while within the
311				 * handler. Here we save the original kprobe
312				 * variables and just single-step on the instruction
313				 * of the new probe without calling any user
314 * handlers.
315 */
316 save_previous_kprobe(kcb);
317 set_current_kprobe(p, regs, kcb);
318 kprobes_inc_nmissed_count(p);
319 prepare_singlestep(p, regs);
320 kcb->kprobe_status = KPROBE_REENTER;
321 return 1;
322 }
323 } else {
324 if (*addr != BREAKPOINT_INSTRUCTION) {
325 /* The breakpoint instruction was removed by
326 * another cpu right after we hit, no further
327 * handling of this interrupt is appropriate
328 */
329 regs->rip = (unsigned long)addr;
330 ret = 1;
331 goto no_kprobe;
332 }
333 p = __get_cpu_var(current_kprobe);
334 if (p->break_handler && p->break_handler(p, regs)) {
335 goto ss_probe;
336 }
337 }
338 goto no_kprobe;
339 }
340
341 p = get_kprobe(addr);
342 if (!p) {
343 if (*addr != BREAKPOINT_INSTRUCTION) {
344 /*
345 * The breakpoint instruction was removed right
346 * after we hit it. Another cpu has removed
347 * either a probepoint or a debugger breakpoint
348 * at this address. In either case, no further
349 * handling of this interrupt is appropriate.
350 * Back up over the (now missing) int3 and run
351 * the original instruction.
352 */
353 regs->rip = (unsigned long)addr;
354 ret = 1;
355 }
356 /* Not one of ours: let kernel handle it */
357 goto no_kprobe;
358 }
359
360 set_current_kprobe(p, regs, kcb);
361 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
362
363 if (p->pre_handler && p->pre_handler(p, regs))
364 /* handler has already set things up, so skip ss setup */
365 return 1;
366
367ss_probe:
368 prepare_singlestep(p, regs);
369 kcb->kprobe_status = KPROBE_HIT_SS;
370 return 1;
371
372no_kprobe:
373 preempt_enable_no_resched();
374 return ret;
375}
376
377/*
378 * For function-return probes, init_kprobes() establishes a probepoint
379 * here. When a retprobed function returns, this probe is hit and
380 * trampoline_probe_handler() runs, calling the kretprobe's handler.
381 */
382void kretprobe_trampoline_holder(void)
383{
384	asm volatile (".global kretprobe_trampoline\n"
385		      "kretprobe_trampoline: \n"
386		      "nop\n");
387}
388
389/*
390 * Called when we hit the probe point at kretprobe_trampoline
391 */
392int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
393{
394 struct kretprobe_instance *ri = NULL;
395 struct hlist_head *head, empty_rp;
396 struct hlist_node *node, *tmp;
397 unsigned long flags, orig_ret_address = 0;
398	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
399
400 INIT_HLIST_HEAD(&empty_rp);
401 spin_lock_irqsave(&kretprobe_lock, flags);
402 head = kretprobe_inst_table_head(current);
403
404 /*
405 * It is possible to have multiple instances associated with a given
406	 * task either because multiple functions in the call path
407	 * have a return probe installed on them, and/or more than one
408	 * return probe was registered for a target function.
409 *
410 * We can handle this because:
411 * - instances are always inserted at the head of the list
412 * - when multiple return probes are registered for the same
413 * function, the first instance's ret_addr will point to the
414 * real return address, and all the rest will point to
415 * kretprobe_trampoline
416 */
417 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
418 if (ri->task != current)
419 /* another task is sharing our hash bucket */
420 continue;
421
422 if (ri->rp && ri->rp->handler)
423 ri->rp->handler(ri, regs);
424
425 orig_ret_address = (unsigned long)ri->ret_addr;
426 recycle_rp_inst(ri, &empty_rp);
427
428 if (orig_ret_address != trampoline_address)
429 /*
430 * This is the real return address. Any other
431 * instances associated with this task are for
432 * other calls deeper on the call stack
433 */
434 break;
435 }
436
437 kretprobe_assert(ri, orig_ret_address, trampoline_address);
438 regs->rip = orig_ret_address;
439
440 reset_current_kprobe();
441 spin_unlock_irqrestore(&kretprobe_lock, flags);
442 preempt_enable_no_resched();
443
444 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
445 hlist_del(&ri->hlist);
446 kfree(ri);
447 }
448 /*
449 * By returning a non-zero value, we are telling
450 * kprobe_handler() that we don't want the post_handler
451 * to run (and have re-enabled preemption)
452 */
453 return 1;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new rip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 */
478static void __kprobes resume_execution(struct kprobe *p,
479 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
480{
481 unsigned long *tos = (unsigned long *)regs->rsp;
482 unsigned long next_rip = 0;
483 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
484 unsigned long orig_rip = (unsigned long)p->addr;
485 kprobe_opcode_t *insn = p->ainsn.insn;
486
487	/* skip the REX prefix */
488 if (*insn >= 0x40 && *insn <= 0x4f)
489 insn++;
490
491 switch (*insn) {
492 case 0x9c: /* pushfl */
493 *tos &= ~(TF_MASK | IF_MASK);
494 *tos |= kcb->kprobe_old_rflags;
495 break;
496 case 0xc3: /* ret/lret */
497 case 0xcb:
498 case 0xc2:
499 case 0xca:
500 regs->eflags &= ~TF_MASK;
501 /* rip is already adjusted, no more changes required*/
502 return;
503 case 0xe8: /* call relative - Fix return addr */
504 *tos = orig_rip + (*tos - copy_rip);
505 break;
506 case 0xff:
507 if ((insn[1] & 0x30) == 0x10) {
508 /* call absolute, indirect */
509 /* Fix return addr; rip is correct. */
510 next_rip = regs->rip;
511 *tos = orig_rip + (*tos - copy_rip);
512 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
513 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
514 /* rip is correct. */
515 next_rip = regs->rip;
516 }
517 break;
518 case 0xea: /* jmp absolute -- rip is correct */
519 next_rip = regs->rip;
520 break;
521 default:
522 break;
523 }
524
525 regs->eflags &= ~TF_MASK;
526 if (next_rip) {
527 regs->rip = next_rip;
528 } else {
529 regs->rip = orig_rip + (regs->rip - copy_rip);
530 }
531}
532
533int __kprobes post_kprobe_handler(struct pt_regs *regs)
534{
535 struct kprobe *cur = kprobe_running();
536 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
537
538 if (!cur)
539 return 0;
540
541 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
542 kcb->kprobe_status = KPROBE_HIT_SSDONE;
543 cur->post_handler(cur, regs, 0);
544 }
545
546 resume_execution(cur, regs, kcb);
547 regs->eflags |= kcb->kprobe_saved_rflags;
548
549 /* Restore the original saved kprobes variables and continue. */
550 if (kcb->kprobe_status == KPROBE_REENTER) {
551 restore_previous_kprobe(kcb);
552 goto out;
553 }
554 reset_current_kprobe();
555out:
556 preempt_enable_no_resched();
557
558 /*
559 * if somebody else is singlestepping across a probe point, eflags
560 * will have TF set, in which case, continue the remaining processing
561 * of do_debug, as if this is not a probe hit.
562 */
563 if (regs->eflags & TF_MASK)
564 return 0;
565
566 return 1;
567}
568
569int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
570{
571 struct kprobe *cur = kprobe_running();
572 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
573 const struct exception_table_entry *fixup;
574
575 switch(kcb->kprobe_status) {
576 case KPROBE_HIT_SS:
577 case KPROBE_REENTER:
578 /*
579 * We are here because the instruction being single
580 * stepped caused a page fault. We reset the current
581 * kprobe and the rip points back to the probe address
582 * and allow the page fault handler to continue as a
583 * normal page fault.
584 */
585 regs->rip = (unsigned long)cur->addr;
586 regs->eflags |= kcb->kprobe_old_rflags;
587 if (kcb->kprobe_status == KPROBE_REENTER)
588 restore_previous_kprobe(kcb);
589 else
590 reset_current_kprobe();
591 preempt_enable_no_resched();
592 break;
593 case KPROBE_HIT_ACTIVE:
594 case KPROBE_HIT_SSDONE:
595 /*
596		 * We increment the nmissed count for accounting;
597		 * we can also use the npre/npostfault counts to account
598		 * for these specific fault cases.
599 */
600 kprobes_inc_nmissed_count(cur);
601
602 /*
603 * We come here because instructions in the pre/post
604 * handler caused the page_fault, this could happen
605 * if handler tries to access user space by
606 * copy_from_user(), get_user() etc. Let the
607 * user-specified handler try to fix it first.
608 */
609 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
610 return 1;
611
612 /*
613 * In case the user-specified fault handler returned
614 * zero, try to fix up.
615 */
616 fixup = search_exception_tables(regs->rip);
617 if (fixup) {
618 regs->rip = fixup->fixup;
619 return 1;
620 }
621
622 /*
623 * fixup() could not handle it,
624 * Let do_page_fault() fix it.
625 */
626 break;
627 default:
628 break;
629 }
630 return 0;
631}
632
633/*
634 * Wrapper routine for handling exceptions.
635 */
636int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
637 unsigned long val, void *data)
638{
639 struct die_args *args = (struct die_args *)data;
640 int ret = NOTIFY_DONE;
641
642 if (args->regs && user_mode(args->regs))
643 return ret;
644
645 switch (val) {
646 case DIE_INT3:
647 if (kprobe_handler(args->regs))
648 ret = NOTIFY_STOP;
649 break;
650 case DIE_DEBUG:
651 if (post_kprobe_handler(args->regs))
652 ret = NOTIFY_STOP;
653 break;
654 case DIE_GPF:
655 case DIE_PAGE_FAULT:
656 /* kprobe_running() needs smp_processor_id() */
657 preempt_disable();
658 if (kprobe_running() &&
659 kprobe_fault_handler(args->regs, args->trapnr))
660 ret = NOTIFY_STOP;
661 preempt_enable();
662 break;
663 default:
664 break;
665 }
666 return ret;
667}
668
669int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
670{
671 struct jprobe *jp = container_of(p, struct jprobe, kp);
672 unsigned long addr;
673 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
674
675 kcb->jprobe_saved_regs = *regs;
676 kcb->jprobe_saved_rsp = (long *) regs->rsp;
677 addr = (unsigned long)(kcb->jprobe_saved_rsp);
678 /*
679 * As Linus pointed out, gcc assumes that the callee
680 * owns the argument space and could overwrite it, e.g.
681 * tailcall optimization. So, to be absolutely safe
682 * we also save and restore enough stack bytes to cover
683 * the argument area.
684 */
685 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
686 MIN_STACK_SIZE(addr));
687 regs->eflags &= ~IF_MASK;
688 regs->rip = (unsigned long)(jp->entry);
689 return 1;
690}
691
692void __kprobes jprobe_return(void)
693{
694 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
695
696 asm volatile (" xchg %%rbx,%%rsp \n"
697 " int3 \n"
698 " .globl jprobe_return_end \n"
699 " jprobe_return_end: \n"
700 " nop \n"::"b"
701 (kcb->jprobe_saved_rsp):"memory");
702}
703
704int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
705{
706 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
707 u8 *addr = (u8 *) (regs->rip - 1);
708 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
709 struct jprobe *jp = container_of(p, struct jprobe, kp);
710
711 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
712 if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
713 struct pt_regs *saved_regs =
714 container_of(kcb->jprobe_saved_rsp,
715 struct pt_regs, rsp);
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
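/*
 * A minimal module-side sketch of the return-probe interface served by the
 * trampoline above.  Illustrative only, not part of the patch: "sys_open" is
 * an arbitrary example symbol.  The handler is invoked from
 * trampoline_probe_handler() when the probed function returns; on x86-64 the
 * return value is in regs->rax.
 */
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	printk(KERN_INFO "%s returned %ld\n", ri->rp->kp.symbol_name,
	       (long)regs->rax);
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= my_ret_handler,
	.maxactive	= 20,	/* instances pre-allocated for concurrent calls */
	.kp.symbol_name	= "sys_open",
};

static int __init kretprobe_example_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_example_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_example_init);
module_exit(kretprobe_example_exit);
MODULE_LICENSE("GPL");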
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c
new file mode 100644
index 000000000000..e0b2d17f4f10
--- /dev/null
+++ b/arch/x86/kernel/ldt_32.c
@@ -0,0 +1,250 @@
1/*
2 * linux/arch/i386/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 */
7
8#include <linux/errno.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <asm/ldt.h>
19#include <asm/desc.h>
20#include <asm/mmu_context.h>
21
22#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
23static void flush_ldt(void *null)
24{
25 if (current->active_mm)
26 load_LDT(&current->active_mm->context);
27}
28#endif
29
30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
31{
32 void *oldldt;
33 void *newldt;
34 int oldsize;
35
36 if (mincount <= pc->size)
37 return 0;
38 oldsize = pc->size;
39 mincount = (mincount+511)&(~511);
40 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
42 else
43 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
44
45 if (!newldt)
46 return -ENOMEM;
47
48 if (oldsize)
49 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
50 oldldt = pc->ldt;
51 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
52 pc->ldt = newldt;
53 wmb();
54 pc->size = mincount;
55 wmb();
56
57 if (reload) {
58#ifdef CONFIG_SMP
59 cpumask_t mask;
60 preempt_disable();
61 load_LDT(pc);
62 mask = cpumask_of_cpu(smp_processor_id());
63 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
64 smp_call_function(flush_ldt, NULL, 1, 1);
65 preempt_enable();
66#else
67 load_LDT(pc);
68#endif
69 }
70 if (oldsize) {
71 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
72 vfree(oldldt);
73 else
74 kfree(oldldt);
75 }
76 return 0;
77}
78
79static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
80{
81 int err = alloc_ldt(new, old->size, 0);
82 if (err < 0)
83 return err;
84 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
85 return 0;
86}
87
88/*
89 * we do not have to muck with descriptors here, that is
90 * done in switch_mm() as needed.
91 */
92int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
93{
94 struct mm_struct * old_mm;
95 int retval = 0;
96
97 init_MUTEX(&mm->context.sem);
98 mm->context.size = 0;
99 old_mm = current->mm;
100 if (old_mm && old_mm->context.size > 0) {
101 down(&old_mm->context.sem);
102 retval = copy_ldt(&mm->context, &old_mm->context);
103 up(&old_mm->context.sem);
104 }
105 return retval;
106}
107
108/*
109 * No need to lock the MM as we are the last user
110 */
111void destroy_context(struct mm_struct *mm)
112{
113 if (mm->context.size) {
114 if (mm == current->active_mm)
115 clear_LDT();
116 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
117 vfree(mm->context.ldt);
118 else
119 kfree(mm->context.ldt);
120 mm->context.size = 0;
121 }
122}
123
124static int read_ldt(void __user * ptr, unsigned long bytecount)
125{
126 int err;
127 unsigned long size;
128 struct mm_struct * mm = current->mm;
129
130 if (!mm->context.size)
131 return 0;
132 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
133 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
134
135 down(&mm->context.sem);
136 size = mm->context.size*LDT_ENTRY_SIZE;
137 if (size > bytecount)
138 size = bytecount;
139
140 err = 0;
141 if (copy_to_user(ptr, mm->context.ldt, size))
142 err = -EFAULT;
143 up(&mm->context.sem);
144 if (err < 0)
145 goto error_return;
146 if (size != bytecount) {
147 /* zero-fill the rest */
148 if (clear_user(ptr+size, bytecount-size) != 0) {
149 err = -EFAULT;
150 goto error_return;
151 }
152 }
153 return bytecount;
154error_return:
155 return err;
156}
157
158static int read_default_ldt(void __user * ptr, unsigned long bytecount)
159{
160 int err;
161 unsigned long size;
162
163 err = 0;
164 size = 5*sizeof(struct desc_struct);
165 if (size > bytecount)
166 size = bytecount;
167
168 err = size;
169 if (clear_user(ptr, size))
170 err = -EFAULT;
171
172 return err;
173}
174
175static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
176{
177 struct mm_struct * mm = current->mm;
178 __u32 entry_1, entry_2;
179 int error;
180 struct user_desc ldt_info;
181
182 error = -EINVAL;
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 /* Allow LDTs to be cleared by the user. */
207 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
208 if (oldmode || LDT_empty(&ldt_info)) {
209 entry_1 = 0;
210 entry_2 = 0;
211 goto install;
212 }
213 }
214
215 entry_1 = LDT_entry_a(&ldt_info);
216 entry_2 = LDT_entry_b(&ldt_info);
217 if (oldmode)
218 entry_2 &= ~(1 << 20);
219
220 /* Install the new entry ... */
221install:
222 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
223 error = 0;
224
225out_unlock:
226 up(&mm->context.sem);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
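/*
 * Hypothetical user-space sketch (not part of the file) exercising the func
 * codes dispatched by sys_modify_ldt() above: 0 reads the current LDT, 2
 * reads the default LDT, and 1/0x11 install a struct user_desc entry (0x11 is
 * the newer variant; 1 is the legacy one that clears the "useable" bit).
 */
#include <asm/ldt.h>		/* struct user_desc, LDT_ENTRIES, LDT_ENTRY_SIZE */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	static char buf[LDT_ENTRIES * LDT_ENTRY_SIZE];
	long n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));	/* func 0: read_ldt */

	if (n < 0)
		perror("modify_ldt");
	else
		printf("LDT: %ld bytes (%ld entries)\n", n, n / LDT_ENTRY_SIZE);
	return 0;
}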
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
new file mode 100644
index 000000000000..bc9ffd5c19cc
--- /dev/null
+++ b/arch/x86/kernel/ldt_64.c
@@ -0,0 +1,252 @@
1/*
2 * linux/arch/x86_64/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/vmalloc.h>
17#include <linux/slab.h>
18
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <asm/ldt.h>
22#include <asm/desc.h>
23#include <asm/proto.h>
24
25#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
26static void flush_ldt(void *null)
27{
28 if (current->active_mm)
29 load_LDT(&current->active_mm->context);
30}
31#endif
32
33static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
34{
35 void *oldldt;
36 void *newldt;
37 unsigned oldsize;
38
39 if (mincount <= (unsigned)pc->size)
40 return 0;
41 oldsize = pc->size;
42 mincount = (mincount+511)&(~511);
43 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
44 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
45 else
46 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
47
48 if (!newldt)
49 return -ENOMEM;
50
51 if (oldsize)
52 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
53 oldldt = pc->ldt;
54 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
55 wmb();
56 pc->ldt = newldt;
57 wmb();
58 pc->size = mincount;
59 wmb();
60 if (reload) {
61#ifdef CONFIG_SMP
62 cpumask_t mask;
63
64 preempt_disable();
65 mask = cpumask_of_cpu(smp_processor_id());
66 load_LDT(pc);
67 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
68 smp_call_function(flush_ldt, NULL, 1, 1);
69 preempt_enable();
70#else
71 load_LDT(pc);
72#endif
73 }
74 if (oldsize) {
75 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
76 vfree(oldldt);
77 else
78 kfree(oldldt);
79 }
80 return 0;
81}
82
83static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
84{
85 int err = alloc_ldt(new, old->size, 0);
86 if (err < 0)
87 return err;
88 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
89 return 0;
90}
91
92/*
93 * we do not have to muck with descriptors here, that is
94 * done in switch_mm() as needed.
95 */
96int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
97{
98 struct mm_struct * old_mm;
99 int retval = 0;
100
101 init_MUTEX(&mm->context.sem);
102 mm->context.size = 0;
103 old_mm = current->mm;
104 if (old_mm && old_mm->context.size > 0) {
105 down(&old_mm->context.sem);
106 retval = copy_ldt(&mm->context, &old_mm->context);
107 up(&old_mm->context.sem);
108 }
109 return retval;
110}
111
112/*
113 *
114 * Don't touch the LDT register - we're already in the next thread.
115 */
116void destroy_context(struct mm_struct *mm)
117{
118 if (mm->context.size) {
119 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
120 vfree(mm->context.ldt);
121 else
122 kfree(mm->context.ldt);
123 mm->context.size = 0;
124 }
125}
126
127static int read_ldt(void __user * ptr, unsigned long bytecount)
128{
129 int err;
130 unsigned long size;
131 struct mm_struct * mm = current->mm;
132
133 if (!mm->context.size)
134 return 0;
135 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
136 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
137
138 down(&mm->context.sem);
139 size = mm->context.size*LDT_ENTRY_SIZE;
140 if (size > bytecount)
141 size = bytecount;
142
143 err = 0;
144 if (copy_to_user(ptr, mm->context.ldt, size))
145 err = -EFAULT;
146 up(&mm->context.sem);
147 if (err < 0)
148 goto error_return;
149 if (size != bytecount) {
150 /* zero-fill the rest */
151 if (clear_user(ptr+size, bytecount-size) != 0) {
152 err = -EFAULT;
153 goto error_return;
154 }
155 }
156 return bytecount;
157error_return:
158 return err;
159}
160
161static int read_default_ldt(void __user * ptr, unsigned long bytecount)
162{
163 /* Arbitrary number */
164 /* x86-64 default LDT is all zeros */
165 if (bytecount > 128)
166 bytecount = 128;
167 if (clear_user(ptr, bytecount))
168 return -EFAULT;
169 return bytecount;
170}
171
172static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
173{
174 struct task_struct *me = current;
175 struct mm_struct * mm = me->mm;
176 __u32 entry_1, entry_2, *lp;
177 int error;
178 struct user_desc ldt_info;
179
180 error = -EINVAL;
181
182 if (bytecount != sizeof(ldt_info))
183 goto out;
184 error = -EFAULT;
185 if (copy_from_user(&ldt_info, ptr, bytecount))
186 goto out;
187
188 error = -EINVAL;
189 if (ldt_info.entry_number >= LDT_ENTRIES)
190 goto out;
191 if (ldt_info.contents == 3) {
192 if (oldmode)
193 goto out;
194 if (ldt_info.seg_not_present == 0)
195 goto out;
196 }
197
198 down(&mm->context.sem);
199 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
200 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
201 if (error < 0)
202 goto out_unlock;
203 }
204
205 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
206
207 /* Allow LDTs to be cleared by the user. */
208 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
209 if (oldmode || LDT_empty(&ldt_info)) {
210 entry_1 = 0;
211 entry_2 = 0;
212 goto install;
213 }
214 }
215
216 entry_1 = LDT_entry_a(&ldt_info);
217 entry_2 = LDT_entry_b(&ldt_info);
218 if (oldmode)
219 entry_2 &= ~(1 << 20);
220
221 /* Install the new entry ... */
222install:
223 *lp = entry_1;
224 *(lp+1) = entry_2;
225 error = 0;
226
227out_unlock:
228 up(&mm->context.sem);
229out:
230 return error;
231}
232
233asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
234{
235 int ret = -ENOSYS;
236
237 switch (func) {
238 case 0:
239 ret = read_ldt(ptr, bytecount);
240 break;
241 case 1:
242 ret = write_ldt(ptr, bytecount, 1);
243 break;
244 case 2:
245 ret = read_default_ldt(ptr, bytecount);
246 break;
247 case 0x11:
248 ret = write_ldt(ptr, bytecount, 0);
249 break;
250 }
251 return ret;
252}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 000000000000..91966bafb3dc
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,171 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/delay.h>
12#include <linux/init.h>
13#include <asm/pgtable.h>
14#include <asm/pgalloc.h>
15#include <asm/tlbflush.h>
16#include <asm/mmu_context.h>
17#include <asm/io.h>
18#include <asm/apic.h>
19#include <asm/cpufeature.h>
20#include <asm/desc.h>
21#include <asm/system.h>
22
23#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
24static u32 kexec_pgd[1024] PAGE_ALIGNED;
25#ifdef CONFIG_X86_PAE
26static u32 kexec_pmd0[1024] PAGE_ALIGNED;
27static u32 kexec_pmd1[1024] PAGE_ALIGNED;
28#endif
29static u32 kexec_pte0[1024] PAGE_ALIGNED;
30static u32 kexec_pte1[1024] PAGE_ALIGNED;
31
32static void set_idt(void *newidt, __u16 limit)
33{
34 struct Xgt_desc_struct curidt;
35
36	/* ia32 supports unaligned loads & stores */
37 curidt.size = limit;
38 curidt.address = (unsigned long)newidt;
39
40 load_idt(&curidt);
41};
42
43
44static void set_gdt(void *newgdt, __u16 limit)
45{
46 struct Xgt_desc_struct curgdt;
47
48 /* ia32 supports unaligned loads & stores */
49 curgdt.size = limit;
50 curgdt.address = (unsigned long)newgdt;
51
52 load_gdt(&curgdt);
53};
54
55static void load_segments(void)
56{
57#define __STR(X) #X
58#define STR(X) __STR(X)
59
60 __asm__ __volatile__ (
61 "\tljmp $"STR(__KERNEL_CS)",$1f\n"
62 "\t1:\n"
63 "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
64 "\tmovl %%eax,%%ds\n"
65 "\tmovl %%eax,%%es\n"
66 "\tmovl %%eax,%%fs\n"
67 "\tmovl %%eax,%%gs\n"
68 "\tmovl %%eax,%%ss\n"
69 ::: "eax", "memory");
70#undef STR
71#undef __STR
72}
73
74/*
75 * An architecture hook called to validate the
76 * proposed image and prepare the control pages
77 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
78 * have been allocated, but the segments have not yet
79 * been copied into the kernel.
80 *
81 * Do whatever setup is needed on the image and the
82 * reboot code buffer to allow us to avoid allocations
83 * later.
84 *
85 * Currently nothing.
86 */
87int machine_kexec_prepare(struct kimage *image)
88{
89 return 0;
90}
91
92/*
93 * Undo anything leftover by machine_kexec_prepare
94 * when an image is freed.
95 */
96void machine_kexec_cleanup(struct kimage *image)
97{
98}
99
100/*
101 * Do not allocate memory (or fail in any way) in machine_kexec().
102 * We are past the point of no return, committed to rebooting now.
103 */
104NORET_TYPE void machine_kexec(struct kimage *image)
105{
106 unsigned long page_list[PAGES_NR];
107 void *control_page;
108
109 /* Interrupts aren't acceptable while we reboot */
110 local_irq_disable();
111
112 control_page = page_address(image->control_code_page);
113 memcpy(control_page, relocate_kernel, PAGE_SIZE);
114
115 page_list[PA_CONTROL_PAGE] = __pa(control_page);
116 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
117 page_list[PA_PGD] = __pa(kexec_pgd);
118 page_list[VA_PGD] = (unsigned long)kexec_pgd;
119#ifdef CONFIG_X86_PAE
120 page_list[PA_PMD_0] = __pa(kexec_pmd0);
121 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
122 page_list[PA_PMD_1] = __pa(kexec_pmd1);
123 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
124#endif
125 page_list[PA_PTE_0] = __pa(kexec_pte0);
126 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
127 page_list[PA_PTE_1] = __pa(kexec_pte1);
128 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
129
130 /* The segment registers are funny things, they have both a
131 * visible and an invisible part. Whenever the visible part is
132	 * set to a specific selector, the invisible part is loaded
133	 * from a table in memory. At no other time is the
134 * descriptor table in memory accessed.
135 *
136 * I take advantage of this here by force loading the
137 * segments, before I zap the gdt with an invalid value.
138 */
139 load_segments();
140 /* The gdt & idt are now invalid.
141 * If you want to load them you must set up your own idt & gdt.
142 */
143 set_gdt(phys_to_virt(0),0);
144 set_idt(phys_to_virt(0),0);
145
146 /* now call it */
147 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
148 image->start, cpu_has_pae);
149}
150
151/* crashkernel=size@addr specifies the location to reserve for
152 * a crash kernel. By reserving this memory we guarantee
153 * that linux never sets it up as a DMA target.
154 * Useful for holding code to do something appropriate
155 * after a kernel panic.
156 */
157static int __init parse_crashkernel(char *arg)
158{
159 unsigned long size, base;
160 size = memparse(arg, &arg);
161 if (*arg == '@') {
162 base = memparse(arg+1, &arg);
163 /* FIXME: Do I want a sanity check
164 * to validate the memory range?
165 */
166 crashk_res.start = base;
167 crashk_res.end = base + size - 1;
168 }
169 return 0;
170}
171early_param("crashkernel", parse_crashkernel);
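/*
 * Illustrative example (not part of the file): booting with the parameter
 * below is parsed by parse_crashkernel() above into crashk_res.start =
 * 0x1000000 and crashk_res.end = 0x4ffffff, i.e. 64 MB reserved at the 16 MB
 * physical mark and kept away from normal use so a kdump kernel can be
 * loaded there later.
 */
	crashkernel=64M@16M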
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644
index 000000000000..c3a554703672
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -0,0 +1,259 @@
1/*
2 * machine_kexec.c - handle transition of Linux booting another kernel
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/kexec.h>
11#include <linux/string.h>
12#include <linux/reboot.h>
13#include <asm/pgtable.h>
14#include <asm/tlbflush.h>
15#include <asm/mmu_context.h>
16#include <asm/io.h>
17
18#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
19static u64 kexec_pgd[512] PAGE_ALIGNED;
20static u64 kexec_pud0[512] PAGE_ALIGNED;
21static u64 kexec_pmd0[512] PAGE_ALIGNED;
22static u64 kexec_pte0[512] PAGE_ALIGNED;
23static u64 kexec_pud1[512] PAGE_ALIGNED;
24static u64 kexec_pmd1[512] PAGE_ALIGNED;
25static u64 kexec_pte1[512] PAGE_ALIGNED;
26
27static void init_level2_page(pmd_t *level2p, unsigned long addr)
28{
29 unsigned long end_addr;
30
31 addr &= PAGE_MASK;
32 end_addr = addr + PUD_SIZE;
33 while (addr < end_addr) {
34 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
35 addr += PMD_SIZE;
36 }
37}
38
39static int init_level3_page(struct kimage *image, pud_t *level3p,
40 unsigned long addr, unsigned long last_addr)
41{
42 unsigned long end_addr;
43 int result;
44
45 result = 0;
46 addr &= PAGE_MASK;
47 end_addr = addr + PGDIR_SIZE;
48 while ((addr < last_addr) && (addr < end_addr)) {
49 struct page *page;
50 pmd_t *level2p;
51
52 page = kimage_alloc_control_pages(image, 0);
53 if (!page) {
54 result = -ENOMEM;
55 goto out;
56 }
57 level2p = (pmd_t *)page_address(page);
58 init_level2_page(level2p, addr);
59 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
60 addr += PUD_SIZE;
61 }
62 /* clear the unused entries */
63 while (addr < end_addr) {
64 pud_clear(level3p++);
65 addr += PUD_SIZE;
66 }
67out:
68 return result;
69}
70
71
72static int init_level4_page(struct kimage *image, pgd_t *level4p,
73 unsigned long addr, unsigned long last_addr)
74{
75 unsigned long end_addr;
76 int result;
77
78 result = 0;
79 addr &= PAGE_MASK;
80 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
81 while ((addr < last_addr) && (addr < end_addr)) {
82 struct page *page;
83 pud_t *level3p;
84
85 page = kimage_alloc_control_pages(image, 0);
86 if (!page) {
87 result = -ENOMEM;
88 goto out;
89 }
90 level3p = (pud_t *)page_address(page);
91 result = init_level3_page(image, level3p, addr, last_addr);
92 if (result) {
93 goto out;
94 }
95 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
96 addr += PGDIR_SIZE;
97 }
98 /* clear the unused entries */
99 while (addr < end_addr) {
100 pgd_clear(level4p++);
101 addr += PGDIR_SIZE;
102 }
103out:
104 return result;
105}
106
107
108static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
109{
110 pgd_t *level4p;
111 level4p = (pgd_t *)__va(start_pgtable);
112 return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
113}
114
115static void set_idt(void *newidt, u16 limit)
116{
117 struct desc_ptr curidt;
118
119	/* x86-64 supports unaligned loads & stores */
120 curidt.size = limit;
121 curidt.address = (unsigned long)newidt;
122
123 __asm__ __volatile__ (
124 "lidtq %0\n"
125 : : "m" (curidt)
126 );
127};
128
129
130static void set_gdt(void *newgdt, u16 limit)
131{
132 struct desc_ptr curgdt;
133
134 /* x86-64 supports unaligned loads & stores */
135 curgdt.size = limit;
136 curgdt.address = (unsigned long)newgdt;
137
138 __asm__ __volatile__ (
139 "lgdtq %0\n"
140 : : "m" (curgdt)
141 );
142};
143
144static void load_segments(void)
145{
146 __asm__ __volatile__ (
147 "\tmovl %0,%%ds\n"
148 "\tmovl %0,%%es\n"
149 "\tmovl %0,%%ss\n"
150 "\tmovl %0,%%fs\n"
151 "\tmovl %0,%%gs\n"
152 : : "a" (__KERNEL_DS) : "memory"
153 );
154}
155
156int machine_kexec_prepare(struct kimage *image)
157{
158 unsigned long start_pgtable;
159 int result;
160
161 /* Calculate the offsets */
162 start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
163
164 /* Setup the identity mapped 64bit page table */
165 result = init_pgtable(image, start_pgtable);
166 if (result)
167 return result;
168
169 return 0;
170}
171
172void machine_kexec_cleanup(struct kimage *image)
173{
174 return;
175}
176
177/*
178 * Do not allocate memory (or fail in any way) in machine_kexec().
179 * We are past the point of no return, committed to rebooting now.
180 */
181NORET_TYPE void machine_kexec(struct kimage *image)
182{
183 unsigned long page_list[PAGES_NR];
184 void *control_page;
185
186 /* Interrupts aren't acceptable while we reboot */
187 local_irq_disable();
188
189 control_page = page_address(image->control_code_page) + PAGE_SIZE;
190 memcpy(control_page, relocate_kernel, PAGE_SIZE);
191
192 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
193 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
194 page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
195 page_list[VA_PGD] = (unsigned long)kexec_pgd;
196 page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
197 page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
198 page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
199 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
200 page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
201 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
202 page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
203 page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
204 page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
205 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
206 page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
207 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
208
209 page_list[PA_TABLE_PAGE] =
210 (unsigned long)__pa(page_address(image->control_code_page));
211
212 /* The segment registers are funny things, they have both a
213 * visible and an invisible part. Whenever the visible part is
214	 * set to a specific selector, the invisible part is loaded
215	 * from a table in memory. At no other time is the
216 * descriptor table in memory accessed.
217 *
218 * I take advantage of this here by force loading the
219 * segments, before I zap the gdt with an invalid value.
220 */
221 load_segments();
222 /* The gdt & idt are now invalid.
223 * If you want to load them you must set up your own idt & gdt.
224 */
225 set_gdt(phys_to_virt(0),0);
226 set_idt(phys_to_virt(0),0);
227
228 /* now call it */
229 relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
230 image->start);
231}
232
233/* crashkernel=size@addr specifies the location to reserve for
234 * a crash kernel. By reserving this memory we guarantee
235 * that linux never sets it up as a DMA target.
236 * Useful for holding code to do something appropriate
237 * after a kernel panic.
238 */
239static int __init setup_crashkernel(char *arg)
240{
241 unsigned long size, base;
242 char *p;
243 if (!arg)
244 return -EINVAL;
245 size = memparse(arg, &p);
246 if (arg == p)
247 return -EINVAL;
248 if (*p == '@') {
249 base = memparse(p+1, &p);
250 /* FIXME: Do I want a sanity check to validate the
251 * memory range? Yes you do, but it's too early for
252 * e820 -AK */
253 crashk_res.start = base;
254 crashk_res.end = base + size - 1;
255 }
256 return 0;
257}
258early_param("crashkernel", setup_crashkernel);
259
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
new file mode 100644
index 000000000000..b83672b89527
--- /dev/null
+++ b/arch/x86/kernel/mca_32.c
@@ -0,0 +1,470 @@
1/*
2 * linux/arch/i386/kernel/mca.c
3 * Written by Martin Kolinek, February 1996
4 *
5 * Changes:
6 *
7 * Chris Beauregard July 28th, 1996
8 * - Fixed up integrated SCSI detection
9 *
10 * Chris Beauregard August 3rd, 1996
11 * - Made mca_info local
12 * - Made integrated registers accessible through standard function calls
13 * - Added name field
14 * - More sanity checking
15 *
16 * Chris Beauregard August 9th, 1996
17 * - Rewrote /proc/mca
18 *
19 * Chris Beauregard January 7th, 1997
20 * - Added basic NMI-processing
21 * - Added more information to mca_info structure
22 *
23 * David Weinehall October 12th, 1998
24 * - Made a lot of cleaning up in the source
25 * - Added use of save_flags / restore_flags
26 * - Added the 'driver_loaded' flag in MCA_adapter
27 * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter
28 *
29 * David Weinehall March 24th, 1999
30 * - Fixed the output of 'Driver Installed' in /proc/mca/pos
31 * - Made the Integrated Video & SCSI show up even if they have id 0000
32 *
33 * Alexander Viro November 9th, 1999
34 * - Switched to regular procfs methods
35 *
36 * Alfred Arnold & David Weinehall August 23rd, 2000
37 * - Added support for Planar POS-registers
38 */
39
40#include <linux/module.h>
41#include <linux/types.h>
42#include <linux/errno.h>
43#include <linux/kernel.h>
44#include <linux/mca.h>
45#include <linux/kprobes.h>
46#include <asm/system.h>
47#include <asm/io.h>
48#include <linux/proc_fs.h>
49#include <linux/mman.h>
50#include <linux/mm.h>
51#include <linux/pagemap.h>
52#include <linux/ioport.h>
53#include <asm/uaccess.h>
54#include <linux/init.h>
55#include <asm/arch_hooks.h>
56
57static unsigned char which_scsi = 0;
58
59int MCA_bus = 0;
60EXPORT_SYMBOL(MCA_bus);
61
62/*
63 * Motherboard register spinlock. Untested on SMP at the moment, but
64 * are there any MCA SMP boxes?
65 *
66 * Yes - Alan
67 */
68static DEFINE_SPINLOCK(mca_lock);
69
70/* Build the status info for the adapter */
71
72static void mca_configure_adapter_status(struct mca_device *mca_dev) {
73 mca_dev->status = MCA_ADAPTER_NONE;
74
75 mca_dev->pos_id = mca_dev->pos[0]
76 + (mca_dev->pos[1] << 8);
77
78 if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
79
80 /* id = 0x0000 usually indicates hardware failure,
81		 * however, ZP Gu (zpg@castle.net) reports that his 9556
82		 * has 0x0000 as id and everything still works. There
83		 * also seems to be an adapter with id = 0x0000; the
84 * NCR Parallel Bus Memory Card. Until this is confirmed,
85 * however, this code will stay.
86 */
87
88 mca_dev->status = MCA_ADAPTER_ERROR;
89
90 return;
91 } else if(mca_dev->pos_id != 0xffff) {
92
93 /* 0xffff usually indicates that there's no adapter,
94 * however, some integrated adapters may have 0xffff as
95 * their id and still be valid. Examples are on-board
96 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
97 * and possibly also the 95 ULTIMEDIA.
98 */
99
100 mca_dev->status = MCA_ADAPTER_NORMAL;
101 }
102
103 if((mca_dev->pos_id == 0xffff ||
104 mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
105 int j;
106
107 for(j = 2; j < 8; j++) {
108 if(mca_dev->pos[j] != 0xff) {
109 mca_dev->status = MCA_ADAPTER_NORMAL;
110 break;
111 }
112 }
113 }
114
115 if(!(mca_dev->pos[2] & MCA_ENABLED)) {
116
117 /* enabled bit is in POS 2 */
118
119 mca_dev->status = MCA_ADAPTER_DISABLED;
120 }
121} /* mca_configure_adapter_status */
122
123/*--------------------------------------------------------------------*/
124
125static struct resource mca_standard_resources[] = {
126 { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
127 { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
128 { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
129 { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
130 { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
131 { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
132 { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
133};
134
135#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
136
137/**
138 * mca_read_and_store_pos - read the POS registers into a memory buffer
139 * @pos: a char pointer to 8 bytes, contains the POS register value on
140 * successful return
141 *
142 * Returns 1 if a card actually exists (i.e. the pos isn't
143 * all 0xff) or 0 otherwise
144 */
145static int mca_read_and_store_pos(unsigned char *pos) {
146 int j;
147 int found = 0;
148
149 for(j=0; j<8; j++) {
150 if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
151 /* 0xff all across means no device. 0x00 means
152 * something's broken, but a device is
153 * probably there. However, if you get 0x00
154 * from a motherboard register it won't matter
155 * what we find. For the record, on the
156 * 57SLC, the integrated SCSI adapter has
157 * 0xffff for the adapter ID, but nonzero for
158 * other registers. */
159
160 found = 1;
161 }
162 }
163 return found;
164}
165
166static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
167{
168 unsigned char byte;
169 unsigned long flags;
170
171 if(reg < 0 || reg >= 8)
172 return 0;
173
174 spin_lock_irqsave(&mca_lock, flags);
175 if(mca_dev->pos_register) {
176 /* Disable adapter setup, enable motherboard setup */
177
178 outb_p(0, MCA_ADAPTER_SETUP_REG);
179 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
180
181 byte = inb_p(MCA_POS_REG(reg));
182 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
183 } else {
184
185 /* Make sure motherboard setup is off */
186
187 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
188
189 /* Read the appropriate register */
190
191 outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
192 byte = inb_p(MCA_POS_REG(reg));
193 outb_p(0, MCA_ADAPTER_SETUP_REG);
194 }
195 spin_unlock_irqrestore(&mca_lock, flags);
196
197 mca_dev->pos[reg] = byte;
198
199 return byte;
200}
201
202static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
203 unsigned char byte)
204{
205 unsigned long flags;
206
207 if(reg < 0 || reg >= 8)
208 return;
209
210 spin_lock_irqsave(&mca_lock, flags);
211
212 /* Make sure motherboard setup is off */
213
214 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
215
216 /* Read in the appropriate register */
217
218 outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
219 outb_p(byte, MCA_POS_REG(reg));
220 outb_p(0, MCA_ADAPTER_SETUP_REG);
221
222 spin_unlock_irqrestore(&mca_lock, flags);
223
224 /* Update the global register list, while we have the byte */
225
226 mca_dev->pos[reg] = byte;
227
228}
229
230/* for the primary MCA bus, we have identity transforms */
231static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
232{
233 return irq;
234}
235
236static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
237{
238 return port;
239}
240
241static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
242{
243 return mem;
244}
245
246
247static int __init mca_init(void)
248{
249 unsigned int i, j;
250 struct mca_device *mca_dev;
251 unsigned char pos[8];
252 short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
253 struct mca_bus *bus;
254
255 /* WARNING: Be careful when making changes here. Putting an adapter
256 * and the motherboard simultaneously into setup mode may result in
257	 * damage to chips (according to The Indispensable PC Hardware Book
258 * by Hans-Peter Messmer). Also, we disable system interrupts (so
259 * that we are not disturbed in the middle of this).
260 */
261
262 /* Make sure the MCA bus is present */
263
264 if (mca_system_init()) {
265 printk(KERN_ERR "MCA bus system initialisation failed\n");
266 return -ENODEV;
267 }
268
269 if (!MCA_bus)
270 return -ENODEV;
271
272 printk(KERN_INFO "Micro Channel bus detected.\n");
273
274 /* All MCA systems have at least a primary bus */
275 bus = mca_attach_bus(MCA_PRIMARY_BUS);
276 if (!bus)
277 goto out_nomem;
278 bus->default_dma_mask = 0xffffffffLL;
279 bus->f.mca_write_pos = mca_pc_write_pos;
280 bus->f.mca_read_pos = mca_pc_read_pos;
281 bus->f.mca_transform_irq = mca_dummy_transform_irq;
282 bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284
285 /* get the motherboard device */
286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev))
288 goto out_nomem;
289
290 /*
291 * We do not expect many MCA interrupts during initialization,
292 * but let us be safe:
293 */
294 spin_lock_irq(&mca_lock);
295
296 /* Make sure adapter setup is off */
297
298 outb_p(0, MCA_ADAPTER_SETUP_REG);
299
300 /* Read motherboard POS registers */
301
302 mca_dev->pos_register = 0x7f;
303 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
304 mca_dev->name[0] = 0;
305 mca_read_and_store_pos(mca_dev->pos);
306 mca_configure_adapter_status(mca_dev);
307 /* fake POS and slot for a motherboard */
308 mca_dev->pos_id = MCA_MOTHERBOARD_POS;
309 mca_dev->slot = MCA_MOTHERBOARD;
310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
311
312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
313 if(unlikely(!mca_dev))
314 goto out_unlock_nomem;
315
316 /* Put motherboard into video setup mode, read integrated video
317 * POS registers, and turn motherboard setup off.
318 */
319
320 mca_dev->pos_register = 0xdf;
321 outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
322 mca_dev->name[0] = 0;
323 mca_read_and_store_pos(mca_dev->pos);
324 mca_configure_adapter_status(mca_dev);
325 /* fake POS and slot for the integrated video */
326 mca_dev->pos_id = MCA_INTEGVIDEO_POS;
327 mca_dev->slot = MCA_INTEGVIDEO;
328 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
329
330 /* Put motherboard into scsi setup mode, read integrated scsi
331 * POS registers, and turn motherboard setup off.
332 *
333 * It seems there are two possible SCSI registers. Martin says that
334	 * for the 56 and 57, 0xf7 is the one, but it fails on the 76.
335 * Alfredo (apena@vnet.ibm.com) says
336 * 0xfd works on his machine. We'll try both of them. I figure it's
337 * a good bet that only one could be valid at a time. This could
338 * screw up though if one is used for something else on the other
339 * machine.
340 */
341
342 for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
343 outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
344 if(mca_read_and_store_pos(pos))
345 break;
346 }
347 if(which_scsi) {
348 /* found a scsi card */
349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
350 if(unlikely(!mca_dev))
351 goto out_unlock_nomem;
352
353 for(j = 0; j < 8; j++)
354 mca_dev->pos[j] = pos[j];
355
356 mca_configure_adapter_status(mca_dev);
357 /* fake POS and slot for integrated SCSI controller */
358 mca_dev->pos_id = MCA_INTEGSCSI_POS;
359 mca_dev->slot = MCA_INTEGSCSI;
360 mca_dev->pos_register = which_scsi;
361 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
362 }
363
364 /* Turn off motherboard setup */
365
366 outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
367
368 /* Now loop over MCA slots: put each adapter into setup mode, and
369 * read its POS registers. Then turn adapter setup off.
370 */
371
372 for(i=0; i<MCA_MAX_SLOT_NR; i++) {
373 outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
374 if(!mca_read_and_store_pos(pos))
375 continue;
376
377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
378 if(unlikely(!mca_dev))
379 goto out_unlock_nomem;
380
381 for(j=0; j<8; j++)
382 mca_dev->pos[j]=pos[j];
383
384 mca_dev->driver_loaded = 0;
385 mca_dev->slot = i;
386 mca_dev->pos_register = 0;
387 mca_configure_adapter_status(mca_dev);
388 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
389 }
390 outb_p(0, MCA_ADAPTER_SETUP_REG);
391
392 /* Re-enable interrupts */
393 spin_unlock_irq(&mca_lock);
394
395 for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
396 request_resource(&ioport_resource, mca_standard_resources + i);
397
398 mca_do_proc_init();
399
400 return 0;
401
402 out_unlock_nomem:
403 spin_unlock_irq(&mca_lock);
404 out_nomem:
405 printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
406 return -ENOMEM;
407}
408
409subsys_initcall(mca_init);
410
411/*--------------------------------------------------------------------*/
412
413static __kprobes void
414mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
415{
416 int slot = mca_dev->slot;
417
418 if(slot == MCA_INTEGSCSI) {
419 printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
420 mca_dev->name);
421 } else if(slot == MCA_INTEGVIDEO) {
422 printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
423 mca_dev->name);
424 } else if(slot == MCA_MOTHERBOARD) {
425 printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
426 mca_dev->name);
427 }
428
429 /* More info available in POS 6 and 7? */
430
431 if(check_flag) {
432 unsigned char pos6, pos7;
433
434 pos6 = mca_device_read_pos(mca_dev, 6);
435 pos7 = mca_device_read_pos(mca_dev, 7);
436
437 printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
438 }
439
440} /* mca_handle_nmi_slot */
441
442/*--------------------------------------------------------------------*/
443
444static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
445{
446 struct mca_device *mca_dev = to_mca_device(dev);
447 unsigned char pos5;
448
449 pos5 = mca_device_read_pos(mca_dev, 5);
450
451 if(!(pos5 & 0x80)) {
452 /* Bit 7 of POS 5 is reset when this adapter has a hardware
453 * error. Bit 6 is reset if there's error information
454 * available in POS 6 and 7.
455 */
456 mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
457 return 1;
458 }
459 return 0;
460}
461
462void __kprobes mca_handle_nmi(void)
463{
464 /* First try - scan the various adapters and see if a specific
465 * adapter was responsible for the error.
466 */
467 bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
468
469 mca_nmi_hook();
470} /* mca_handle_nmi */
diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c
new file mode 100644
index 000000000000..a66d607f5b92
--- /dev/null
+++ b/arch/x86/kernel/mce_64.c
@@ -0,0 +1,875 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <linux/capability.h>
19#include <linux/cpu.h>
20#include <linux/percpu.h>
21#include <linux/poll.h>
22#include <linux/thread_info.h>
23#include <linux/ctype.h>
24#include <linux/kmod.h>
25#include <linux/kdebug.h>
26#include <asm/processor.h>
27#include <asm/msr.h>
28#include <asm/mce.h>
29#include <asm/uaccess.h>
30#include <asm/smp.h>
31#include <asm/idle.h>
32
33#define MISC_MCELOG_MINOR 227
34#define NR_BANKS 6
35
36atomic_t mce_entry;
37
38static int mce_dont_init;
39
40/*
41 * Tolerant levels:
42 * 0: always panic on uncorrected errors, log corrected errors
43 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
44 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
45 * 3: never panic or SIGBUS, log all errors (for testing only)
46 */
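/* Note: the level is set at boot via the mce= option (parsed in
 * mcheck_enable() below) and can be changed at runtime through the
 * sysfs "tolerant" attribute created by the ACCESSOR() macros below. */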
47static int tolerant = 1;
48static int banks;
49static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
50static unsigned long notify_user;
51static int rip_msr;
52static int mce_bootlog = 1;
53static atomic_t mce_events;
54
55static char trigger[128];
56static char *trigger_argv[2] = { trigger, NULL };
57
58static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
59
60/*
61 * Lockless MCE logging infrastructure.
62 * This avoids deadlocks on printk locks without having to break locks. It
63 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
64 */
65
66struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN,
69};
70
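/*
 * Writer protocol, in short: claim a free slot by advancing mcelog.next
 * with cmpxchg(), copy the record in, and only then set its 'finished'
 * flag, so readers never see a partially written entry.
 */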
71void mce_log(struct mce *mce)
72{
73 unsigned next, entry;
74 atomic_inc(&mce_events);
75 mce->finished = 0;
76 wmb();
77 for (;;) {
78 entry = rcu_dereference(mcelog.next);
79 /* The rmb forces the compiler to reload next in each
80 iteration */
81 rmb();
82 for (;;) {
83 /* When the buffer fills up, discard new entries. Assume
84 that the earlier errors are the more interesting ones. */
85 if (entry >= MCE_LOG_LEN) {
86 set_bit(MCE_OVERFLOW, &mcelog.flags);
87 return;
88 }
89 /* Old left over entry. Skip. */
90 if (mcelog.entry[entry].finished) {
91 entry++;
92 continue;
93 }
94 break;
95 }
96 smp_rmb();
97 next = entry + 1;
98 if (cmpxchg(&mcelog.next, entry, next) == entry)
99 break;
100 }
101 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
102 wmb();
103 mcelog.entry[entry].finished = 1;
104 wmb();
105
106 set_bit(0, &notify_user);
107}
108
109static void print_mce(struct mce *m)
110{
111 printk(KERN_EMERG "\n"
112 KERN_EMERG "HARDWARE ERROR\n"
113 KERN_EMERG
114 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
115 m->cpu, m->mcgstatus, m->bank, m->status);
116 if (m->rip) {
117 printk(KERN_EMERG
118 "RIP%s %02x:<%016Lx> ",
119 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
120 m->cs, m->rip);
121 if (m->cs == __KERNEL_CS)
122 print_symbol("{%s}", m->rip);
123 printk("\n");
124 }
125 printk(KERN_EMERG "TSC %Lx ", m->tsc);
126 if (m->addr)
127 printk("ADDR %Lx ", m->addr);
128 if (m->misc)
129 printk("MISC %Lx ", m->misc);
130 printk("\n");
131 printk(KERN_EMERG "This is not a software problem!\n");
132 printk(KERN_EMERG
133 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
134}
135
136static void mce_panic(char *msg, struct mce *backup, unsigned long start)
137{
138 int i;
139
140 oops_begin();
141 for (i = 0; i < MCE_LOG_LEN; i++) {
142 unsigned long tsc = mcelog.entry[i].tsc;
143 if (time_before(tsc, start))
144 continue;
145 print_mce(&mcelog.entry[i]);
146 if (backup && mcelog.entry[i].tsc == backup->tsc)
147 backup = NULL;
148 }
149 if (backup)
150 print_mce(backup);
151 panic(msg);
152}
153
154static int mce_available(struct cpuinfo_x86 *c)
155{
156 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
157}
158
159static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
160{
161 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
162 m->rip = regs->rip;
163 m->cs = regs->cs;
164 } else {
165 m->rip = 0;
166 m->cs = 0;
167 }
168 if (rip_msr) {
169 /* Assume the RIP in the MSR is exact. Is this true? */
170 m->mcgstatus |= MCG_STATUS_EIPV;
171 rdmsrl(rip_msr, m->rip);
172 m->cs = 0;
173 }
174}
175
176/*
177 * The actual machine check handler
178 */
179
180void do_machine_check(struct pt_regs * regs, long error_code)
181{
182 struct mce m, panicm;
183 u64 mcestart = 0;
184 int i;
185 int panicm_found = 0;
186 /*
187 * If no_way_out gets set, there is no safe way to recover from this
188 * MCE. If tolerant is cranked up, we'll try anyway.
189 */
190 int no_way_out = 0;
191 /*
192 * If kill_it gets set, there might be a way to recover from this
193 * error.
194 */
195 int kill_it = 0;
196
197 atomic_inc(&mce_entry);
198
199 if (regs)
200 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
201 if (!banks)
202 goto out2;
203
204 memset(&m, 0, sizeof(struct mce));
205 m.cpu = smp_processor_id();
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 /* if the restart IP is not valid, we're done for */
208 if (!(m.mcgstatus & MCG_STATUS_RIPV))
209 no_way_out = 1;
210
211 rdtscll(mcestart);
212 barrier();
213
214 for (i = 0; i < banks; i++) {
215 if (!bank[i])
216 continue;
217
218 m.misc = 0;
219 m.addr = 0;
220 m.bank = i;
221 m.tsc = 0;
222
223 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
224 if ((m.status & MCI_STATUS_VAL) == 0)
225 continue;
226
227 if (m.status & MCI_STATUS_EN) {
228 /* if PCC was set, there's no way out */
229 no_way_out |= !!(m.status & MCI_STATUS_PCC);
230 /*
231 * If this error was uncorrectable and there was
232 * an overflow, we're in trouble. If no overflow,
233 * we might get away with just killing a task.
234 */
235 if (m.status & MCI_STATUS_UC) {
236 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
237 no_way_out = 1;
238 kill_it = 1;
239 }
240 }
241
242 if (m.status & MCI_STATUS_MISCV)
243 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
244 if (m.status & MCI_STATUS_ADDRV)
245 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
246
247 mce_get_rip(&m, regs);
248 if (error_code >= 0)
249 rdtscll(m.tsc);
250 if (error_code != -2)
251 mce_log(&m);
252
253 /* Did this bank cause the exception? */
254 /* Assume that the bank with uncorrectable errors did it,
255 and that there is only a single one. */
256 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
257 panicm = m;
258 panicm_found = 1;
259 }
260
261 add_taint(TAINT_MACHINE_CHECK);
262 }
263
264 /* Never do anything final in the polling timer */
265 if (!regs)
266 goto out;
267
268 /* If we didn't find an uncorrectable error, pick
269 the last one (shouldn't happen, just being safe). */
270 if (!panicm_found)
271 panicm = m;
272
273 /*
274 * If we have decided that we just CAN'T continue, and the user
275 * has not set tolerant to an insane level, give up and die.
276 */
277 if (no_way_out && tolerant < 3)
278 mce_panic("Machine check", &panicm, mcestart);
279
280 /*
281 * If the error seems to be unrecoverable, something should be
282 * done. Try to kill as little as possible. If we can kill just
283 * one task, do that. If the user has set the tolerance very
284 * high, don't try to do anything at all.
285 */
286 if (kill_it && tolerant < 3) {
287 int user_space = 0;
288
289 /*
290 * If the EIPV bit is set, it means the saved IP is the
291 * instruction which caused the MCE.
292 */
293 if (m.mcgstatus & MCG_STATUS_EIPV)
294 user_space = panicm.rip && (panicm.cs & 3);
295
296 /*
297 * If we know that the error was in user space, send a
298 * SIGBUS. Otherwise, panic if tolerance is low.
299 *
300 * do_exit() takes an awful lot of locks and has a slight
301 * risk of deadlocking.
302 */
303 if (user_space) {
304 do_exit(SIGBUS);
305 } else if (panic_on_oops || tolerant < 2) {
306 mce_panic("Uncorrected machine check",
307 &panicm, mcestart);
308 }
309 }
310
311 /* notify userspace ASAP */
312 set_thread_flag(TIF_MCE_NOTIFY);
313
314 out:
315 /* the last thing we do is clear state */
316 for (i = 0; i < banks; i++)
317 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
318 wrmsrl(MSR_IA32_MCG_STATUS, 0);
319 out2:
320 atomic_dec(&mce_entry);
321}
322
323#ifdef CONFIG_X86_MCE_INTEL
324/***
325 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
326 * @cpu: The CPU on which the event occurred.
327 * @status: Event status information
328 *
329 * This function should be called by the thermal interrupt after the
330 * event has been processed and the decision was made to log the event
331 * further.
332 *
333 * The status parameter will be saved to the 'status' field of 'struct mce'
334 * and historically has been the register value of the
335 * MSR_IA32_THERMAL_STATUS (Intel) msr.
336 */
337void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
338{
339 struct mce m;
340
341 memset(&m, 0, sizeof(m));
342 m.cpu = cpu;
343 m.bank = MCE_THERMAL_BANK;
344 m.status = status;
345 rdtscll(m.tsc);
346 mce_log(&m);
347}
348#endif /* CONFIG_X86_MCE_INTEL */
349
350/*
351 * Periodic polling timer for "silent" machine check errors. If the
352 * poller finds an MCE, poll 2x faster. When the poller finds no more
353 * errors, poll 2x slower (up to check_interval seconds).
354 */
355
356static int check_interval = 5 * 60; /* 5 minutes */
357static int next_interval; /* in jiffies */
358static void mcheck_timer(struct work_struct *work);
359static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
360
361static void mcheck_check_cpu(void *info)
362{
363 if (mce_available(&current_cpu_data))
364 do_machine_check(NULL, 0);
365}
366
367static void mcheck_timer(struct work_struct *work)
368{
369 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
370
371 /*
372 * Alert userspace if needed. If we logged an MCE, reduce the
373 * polling interval, otherwise increase the polling interval.
374 */
375 if (mce_notify_user()) {
376 next_interval = max(next_interval/2, HZ/100);
377 } else {
378 next_interval = min(next_interval*2,
379 (int)round_jiffies_relative(check_interval*HZ));
380 }
381
382 schedule_delayed_work(&mcheck_work, next_interval);
383}
384
385/*
386 * This is only called from process context. This is where we do
387 * anything we need to alert userspace about new MCEs. This is called
388 * directly from the poller and also from entry.S and idle, thanks to
389 * TIF_MCE_NOTIFY.
390 */
391int mce_notify_user(void)
392{
393 clear_thread_flag(TIF_MCE_NOTIFY);
394 if (test_and_clear_bit(0, &notify_user)) {
395 static unsigned long last_print;
396 unsigned long now = jiffies;
397
398 wake_up_interruptible(&mce_wait);
399 if (trigger[0])
400 call_usermodehelper(trigger, trigger_argv, NULL,
401 UMH_NO_WAIT);
402
403 if (time_after_eq(now, last_print + (check_interval*HZ))) {
404 last_print = now;
405 printk(KERN_INFO "Machine check events logged\n");
406 }
407
408 return 1;
409 }
410 return 0;
411}
412
413/* see if the idle task needs to notify userspace */
414static int
415mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
416{
417 /* IDLE_END should be safe - interrupts are back on */
418 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
419 mce_notify_user();
420
421 return NOTIFY_OK;
422}
423
424static struct notifier_block mce_idle_notifier = {
425 .notifier_call = mce_idle_callback,
426};
427
428static __init int periodic_mcheck_init(void)
429{
430 next_interval = check_interval * HZ;
431 if (next_interval)
432 schedule_delayed_work(&mcheck_work,
433 round_jiffies_relative(next_interval));
434 idle_notifier_register(&mce_idle_notifier);
435 return 0;
436}
437__initcall(periodic_mcheck_init);
438
439
440/*
441 * Initialize Machine Checks for a CPU.
442 */
443static void mce_init(void *dummy)
444{
445 u64 cap;
446 int i;
447
448 rdmsrl(MSR_IA32_MCG_CAP, cap);
449 banks = cap & 0xff;
450 if (banks > NR_BANKS) {
451 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
452 banks = NR_BANKS;
453 }
454 /* Use accurate RIP reporting if available. */
455 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
456 rip_msr = MSR_IA32_MCG_EIP;
457
458 /* Log the machine checks left over from the previous reset.
459 This also clears all registers */
460 do_machine_check(NULL, mce_bootlog ? -1 : -2);
461
462 set_in_cr4(X86_CR4_MCE);
463
464 if (cap & MCG_CTL_P)
465 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
466
467 for (i = 0; i < banks; i++) {
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
470 }
471}
472
473/* Add per CPU specific workarounds here */
474static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
475{
476 /* This should be disabled by the BIOS, but isn't always */
477 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
478 /* disable GART TBL walk error reporting, which trips off
479 incorrectly with the IOMMU & 3ware & Cerberus. */
480 clear_bit(10, &bank[4]);
481 /* Lots of broken BIOS around that don't clear them
482 by default and leave crap in there. Don't log. */
483 mce_bootlog = 0;
484 }
485
486}
487
488static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
489{
490 switch (c->x86_vendor) {
491 case X86_VENDOR_INTEL:
492 mce_intel_feature_init(c);
493 break;
494 case X86_VENDOR_AMD:
495 mce_amd_feature_init(c);
496 break;
497 default:
498 break;
499 }
500}
501
502/*
503 * Called for each booted CPU to set up machine checks.
504 * Must be called with preempt off.
505 */
506void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
507{
508 static cpumask_t mce_cpus = CPU_MASK_NONE;
509
510 mce_cpu_quirks(c);
511
512 if (mce_dont_init ||
513 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
514 !mce_available(c))
515 return;
516
517 mce_init(NULL);
518 mce_cpu_features(c);
519}
520
521/*
522 * Character device to read and clear the MCE log.
523 */
524
525static DEFINE_SPINLOCK(mce_state_lock);
526static int open_count; /* #times opened */
527static int open_exclu; /* already open exclusive? */
528
529static int mce_open(struct inode *inode, struct file *file)
530{
531 spin_lock(&mce_state_lock);
532
533 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
534 spin_unlock(&mce_state_lock);
535 return -EBUSY;
536 }
537
538 if (file->f_flags & O_EXCL)
539 open_exclu = 1;
540 open_count++;
541
542 spin_unlock(&mce_state_lock);
543
544 return nonseekable_open(inode, file);
545}
546
547static int mce_release(struct inode *inode, struct file *file)
548{
549 spin_lock(&mce_state_lock);
550
551 open_count--;
552 open_exclu = 0;
553
554 spin_unlock(&mce_state_lock);
555
556 return 0;
557}
558
559static void collect_tscs(void *data)
560{
561 unsigned long *cpu_tsc = (unsigned long *)data;
562 rdtscll(cpu_tsc[smp_processor_id()]);
563}
564
565static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
566{
567 unsigned long *cpu_tsc;
568 static DECLARE_MUTEX(mce_read_sem);
569 unsigned next;
570 char __user *buf = ubuf;
571 int i, err;
572
573 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
574 if (!cpu_tsc)
575 return -ENOMEM;
576
577 down(&mce_read_sem);
578 next = rcu_dereference(mcelog.next);
579
580 /* Only supports full reads right now */
581 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
582 up(&mce_read_sem);
583 kfree(cpu_tsc);
584 return -EINVAL;
585 }
586
587 err = 0;
588 for (i = 0; i < next; i++) {
589 unsigned long start = jiffies;
590 while (!mcelog.entry[i].finished) {
591 if (time_after_eq(jiffies, start + 2)) {
592 memset(mcelog.entry + i,0, sizeof(struct mce));
593 goto timeout;
594 }
595 cpu_relax();
596 }
597 smp_rmb();
598 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
599 buf += sizeof(struct mce);
600 timeout:
601 ;
602 }
603
604 memset(mcelog.entry, 0, next * sizeof(struct mce));
605 mcelog.next = 0;
606
607 synchronize_sched();
608
609 /* Collect entries that were still getting written before the synchronize. */
610
611 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
612 for (i = next; i < MCE_LOG_LEN; i++) {
613 if (mcelog.entry[i].finished &&
614 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
615 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
616 smp_rmb();
617 buf += sizeof(struct mce);
618 memset(&mcelog.entry[i], 0, sizeof(struct mce));
619 }
620 }
621 up(&mce_read_sem);
622 kfree(cpu_tsc);
623 return err ? -EFAULT : buf - ubuf;
624}
625
626static unsigned int mce_poll(struct file *file, poll_table *wait)
627{
628 poll_wait(file, &mce_wait, wait);
629 if (rcu_dereference(mcelog.next))
630 return POLLIN | POLLRDNORM;
631 return 0;
632}
633
634static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
635{
636 int __user *p = (int __user *)arg;
637 if (!capable(CAP_SYS_ADMIN))
638 return -EPERM;
639 switch (cmd) {
640 case MCE_GET_RECORD_LEN:
641 return put_user(sizeof(struct mce), p);
642 case MCE_GET_LOG_LEN:
643 return put_user(MCE_LOG_LEN, p);
644 case MCE_GETCLEAR_FLAGS: {
645 unsigned flags;
646 do {
647 flags = mcelog.flags;
648 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
649 return put_user(flags, p);
650 }
651 default:
652 return -ENOTTY;
653 }
654}
655
656static const struct file_operations mce_chrdev_ops = {
657 .open = mce_open,
658 .release = mce_release,
659 .read = mce_read,
660 .poll = mce_poll,
661 .ioctl = mce_ioctl,
662};
663
664static struct miscdevice mce_log_device = {
665 MISC_MCELOG_MINOR,
666 "mcelog",
667 &mce_chrdev_ops,
668};
669
670static unsigned long old_cr4 __initdata;
671
672void __init stop_mce(void)
673{
674 old_cr4 = read_cr4();
675 clear_in_cr4(X86_CR4_MCE);
676}
677
678void __init restart_mce(void)
679{
680 if (old_cr4 & X86_CR4_MCE)
681 set_in_cr4(X86_CR4_MCE);
682}
683
684/*
685 * Old style boot options parsing. Only for compatibility.
686 */
687
688static int __init mcheck_disable(char *str)
689{
690 mce_dont_init = 1;
691 return 1;
692}
693
694/* mce=off disables machine check. Note you can reenable it later
695 using sysfs.
696 mce=TOLERANCELEVEL (number, see above)
697 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
698 mce=nobootlog Don't log MCEs from before booting. */
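/* Illustrative examples: "mce=2" sets tolerant to 2, "mce=nobootlog"
   suppresses logging of pre-boot MCEs, and "mce=off" skips MCE init. */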
699static int __init mcheck_enable(char *str)
700{
701 if (*str == '=')
702 str++;
703 if (!strcmp(str, "off"))
704 mce_dont_init = 1;
705 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
706 mce_bootlog = str[0] == 'b';
707 else if (isdigit(str[0]))
708 get_option(&str, &tolerant);
709 else
710 printk("mce= argument %s ignored. Please use /sys", str);
711 return 1;
712}
713
714__setup("nomce", mcheck_disable);
715__setup("mce", mcheck_enable);
716
717/*
718 * Sysfs support
719 */
720
721/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
722 Only one CPU is active at this time, the others get re-added later using
723 CPU hotplug. */
724static int mce_resume(struct sys_device *dev)
725{
726 mce_init(NULL);
727 return 0;
728}
729
730/* Reinit MCEs after user configuration changes */
731static void mce_restart(void)
732{
733 if (next_interval)
734 cancel_delayed_work(&mcheck_work);
735 /* Timer race is harmless here */
736 on_each_cpu(mce_init, NULL, 1, 1);
737 next_interval = check_interval * HZ;
738 if (next_interval)
739 schedule_delayed_work(&mcheck_work,
740 round_jiffies_relative(next_interval));
741}
742
743static struct sysdev_class mce_sysclass = {
744 .resume = mce_resume,
745 set_kset_name("machinecheck"),
746};
747
748DEFINE_PER_CPU(struct sys_device, device_mce);
749
750/* Why are there no generic functions for this? */
751#define ACCESSOR(name, var, start) \
752 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
753 return sprintf(buf, "%lx\n", (unsigned long)var); \
754 } \
755 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
756 char *end; \
757 unsigned long new = simple_strtoul(buf, &end, 0); \
758 if (end == buf) return -EINVAL; \
759 var = new; \
760 start; \
761 return end-buf; \
762 } \
763 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
764
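/*
 * For illustration, ACCESSOR(bank0ctl, bank[0], mce_restart()) expands to a
 * show_bank0ctl()/set_bank0ctl() pair plus a 0644 "bank0ctl" sysdev attribute;
 * writing a new value stores it in bank[0] and then calls mce_restart().
 */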
765/* TBD should generate these dynamically based on number of available banks */
766ACCESSOR(bank0ctl,bank[0],mce_restart())
767ACCESSOR(bank1ctl,bank[1],mce_restart())
768ACCESSOR(bank2ctl,bank[2],mce_restart())
769ACCESSOR(bank3ctl,bank[3],mce_restart())
770ACCESSOR(bank4ctl,bank[4],mce_restart())
771ACCESSOR(bank5ctl,bank[5],mce_restart())
772
773static ssize_t show_trigger(struct sys_device *s, char *buf)
774{
775 strcpy(buf, trigger);
776 strcat(buf, "\n");
777 return strlen(trigger) + 1;
778}
779
780static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
781{
782 char *p;
783 int len;
784 strncpy(trigger, buf, sizeof(trigger));
785 trigger[sizeof(trigger)-1] = 0;
786 len = strlen(trigger);
787 p = strchr(trigger, '\n');
788 if (p) *p = 0;
789 return len;
790}
791
792static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
793ACCESSOR(tolerant,tolerant,)
794ACCESSOR(check_interval,check_interval,mce_restart())
795static struct sysdev_attribute *mce_attributes[] = {
796 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
797 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
798 &attr_tolerant, &attr_check_interval, &attr_trigger,
799 NULL
800};
801
802/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
803static __cpuinit int mce_create_device(unsigned int cpu)
804{
805 int err;
806 int i;
807 if (!mce_available(&cpu_data[cpu]))
808 return -EIO;
809
810 per_cpu(device_mce,cpu).id = cpu;
811 per_cpu(device_mce,cpu).cls = &mce_sysclass;
812
813 err = sysdev_register(&per_cpu(device_mce,cpu));
814
815 if (!err) {
816 for (i = 0; mce_attributes[i]; i++)
817 sysdev_create_file(&per_cpu(device_mce,cpu),
818 mce_attributes[i]);
819 }
820 return err;
821}
822
823static void mce_remove_device(unsigned int cpu)
824{
825 int i;
826
827 for (i = 0; mce_attributes[i]; i++)
828 sysdev_remove_file(&per_cpu(device_mce,cpu),
829 mce_attributes[i]);
830 sysdev_unregister(&per_cpu(device_mce,cpu));
831 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
832}
833
834/* Get notified when a cpu comes on/off. Be hotplug friendly. */
835static int
836mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
837{
838 unsigned int cpu = (unsigned long)hcpu;
839
840 switch (action) {
841 case CPU_ONLINE:
842 case CPU_ONLINE_FROZEN:
843 mce_create_device(cpu);
844 break;
845 case CPU_DEAD:
846 case CPU_DEAD_FROZEN:
847 mce_remove_device(cpu);
848 break;
849 }
850 return NOTIFY_OK;
851}
852
853static struct notifier_block mce_cpu_notifier = {
854 .notifier_call = mce_cpu_callback,
855};
856
857static __init int mce_init_device(void)
858{
859 int err;
860 int i = 0;
861
862 if (!mce_available(&boot_cpu_data))
863 return -EIO;
864 err = sysdev_class_register(&mce_sysclass);
865
866 for_each_online_cpu(i) {
867 mce_create_device(i);
868 }
869
870 register_hotcpu_notifier(&mce_cpu_notifier);
871 misc_register(&mce_log_device);
872 return err;
873}
874
875device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c
new file mode 100644
index 000000000000..2f8a7f18b0fe
--- /dev/null
+++ b/arch/x86/kernel/mce_amd_64.c
@@ -0,0 +1,689 @@
1/*
2 * (c) 2005, 2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Written by Jacob Shin - AMD, Inc.
8 *
9 * Support : jacob.shin@amd.com
10 *
11 * April 2006
12 * - added support for AMD Family 0x10 processors
13 *
14 * All MC4_MISCi registers are shared between the cores of a multi-core package
15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h>
23#include <linux/sched.h>
24#include <linux/smp.h>
25#include <linux/sysdev.h>
26#include <linux/sysfs.h>
27#include <asm/apic.h>
28#include <asm/mce.h>
29#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32
33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1"
35#define NR_BANKS 6
36#define NR_BLOCKS 9
37#define THRESHOLD_MAX 0xFFF
38#define INT_TYPE_APIC 0x00020000
39#define MASK_VALID_HI 0x80000000
40#define MASK_CNTP_HI 0x40000000
41#define MASK_LOCKED_HI 0x20000000
42#define MASK_LVTOFF_HI 0x00F00000
43#define MASK_COUNT_EN_HI 0x00080000
44#define MASK_INT_TYPE_HI 0x00060000
45#define MASK_OVERFLOW_HI 0x00010000
46#define MASK_ERR_COUNT_HI 0x00000FFF
47#define MASK_BLKPTR_LO 0xFF000000
48#define MCG_XBLK_ADDR 0xC0000400
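/*
 * Block 0 of a bank is its MCi_MISC MSR; if the BLKPTR field in the high
 * word is non-zero, it is an offset from MCG_XBLK_ADDR to a run of extra
 * threshold blocks, which are then walked as consecutive MSRs below.
 */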
49
50struct threshold_block {
51 unsigned int block;
52 unsigned int bank;
53 unsigned int cpu;
54 u32 address;
55 u16 interrupt_enable;
56 u16 threshold_limit;
57 struct kobject kobj;
58 struct list_head miscj;
59};
60
61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX,
65};
66
67struct threshold_bank {
68 struct kobject kobj;
69 struct threshold_block *blocks;
70 cpumask_t cpus;
71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73
74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = {
76 0, 0, 0, 0, 1
77};
78#endif
79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81
82/*
83 * CPU Initialization
84 */
85
86/* must be called with correct cpu affinity */
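/* The hardware error counter counts up and fires on overflow, so it is
 * seeded with THRESHOLD_MAX - threshold_limit; show_error_count() below
 * applies the inverse to report the number of errors seen so far. */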
87static void threshold_restart_bank(struct threshold_block *b,
88 int reset, u16 old_limit)
89{
90 u32 mci_misc_hi, mci_misc_lo;
91
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi);
93
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */
96
97 if (reset) { /* reset err count and overflow bit */
98 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX);
106 }
107
108 b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111
112 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi);
114}
115
116/* cpu init entry point, called from mce.c with preempt off */
117void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{
119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id();
121 u32 low = 0, high = 0, address = 0;
122
123 for (bank = 0; bank < NR_BANKS; ++bank) {
124 for (block = 0; block < NR_BLOCKS; ++block) {
125 if (block == 0)
126 address = MSR_IA32_MC0_MISC + bank * 4;
127 else if (block == 1) {
128 address = (low & MASK_BLKPTR_LO) >> 21;
129 if (!address)
130 break;
131 address += MCG_XBLK_ADDR;
132 }
133 else
134 ++address;
135
136 if (rdmsr_safe(address, &low, &high))
137 break;
138
139 if (!(high & MASK_VALID_HI)) {
140 if (block)
141 continue;
142 else
143 break;
144 }
145
146 if (!(high & MASK_CNTP_HI) ||
147 (high & MASK_LOCKED_HI))
148 continue;
149
150 if (!block)
151 per_cpu(bank_map, cpu) |= (1 << bank);
152#ifdef CONFIG_SMP
153 if (shared_bank[bank] && c->cpu_core_id)
154 break;
155#endif
156 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
158 wrmsr(address, low, high);
159
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 }
167 }
168}
169
170/*
171 * APIC Interrupt Handler
172 */
173
174/*
175 * The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
176 * The interrupt goes off when error_count reaches threshold_limit.
177 * The handler simply logs an mcelog entry with a software-defined bank number.
178 */
179asmlinkage void mce_threshold_interrupt(void)
180{
181 unsigned int bank, block;
182 struct mce m;
183 u32 low = 0, high = 0, address = 0;
184
185 ack_APIC_irq();
186 exit_idle();
187 irq_enter();
188
189 memset(&m, 0, sizeof(m));
190 rdtscll(m.tsc);
191 m.cpu = smp_processor_id();
192
193 /* assume first bank caused it */
194 for (bank = 0; bank < NR_BANKS; ++bank) {
195 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
196 continue;
197 for (block = 0; block < NR_BLOCKS; ++block) {
198 if (block == 0)
199 address = MSR_IA32_MC0_MISC + bank * 4;
200 else if (block == 1) {
201 address = (low & MASK_BLKPTR_LO) >> 21;
202 if (!address)
203 break;
204 address += MCG_XBLK_ADDR;
205 }
206 else
207 ++address;
208
209 if (rdmsr_safe(address, &low, &high))
210 break;
211
212 if (!(high & MASK_VALID_HI)) {
213 if (block)
214 continue;
215 else
216 break;
217 }
218
219 if (!(high & MASK_CNTP_HI) ||
220 (high & MASK_LOCKED_HI))
221 continue;
222
223 /* Log the machine check that caused the threshold
224 event. */
225 do_machine_check(NULL, 0);
226
227 if (high & MASK_OVERFLOW_HI) {
228 rdmsrl(address, m.misc);
229 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
230 m.status);
231 m.bank = K8_MCE_THRESHOLD_BASE
232 + bank * NR_BLOCKS
233 + block;
234 mce_log(&m);
235 goto out;
236 }
237 }
238 }
239out:
240 irq_exit();
241}
242
243/*
244 * Sysfs Interface
245 */
246
247struct threshold_attr {
248 struct attribute attr;
249 ssize_t(*show) (struct threshold_block *, char *);
250 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
251};
252
253static cpumask_t affinity_set(unsigned int cpu)
254{
255 cpumask_t oldmask = current->cpus_allowed;
256 cpumask_t newmask = CPU_MASK_NONE;
257 cpu_set(cpu, newmask);
258 set_cpus_allowed(current, newmask);
259 return oldmask;
260}
261
262static void affinity_restore(cpumask_t oldmask)
263{
264 set_cpus_allowed(current, oldmask);
265}
266
267#define SHOW_FIELDS(name) \
268static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
269{ \
270 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
271}
272SHOW_FIELDS(interrupt_enable)
273SHOW_FIELDS(threshold_limit)
274
275static ssize_t store_interrupt_enable(struct threshold_block *b,
276 const char *buf, size_t count)
277{
278 char *end;
279 cpumask_t oldmask;
280 unsigned long new = simple_strtoul(buf, &end, 0);
281 if (end == buf)
282 return -EINVAL;
283 b->interrupt_enable = !!new;
284
285 oldmask = affinity_set(b->cpu);
286 threshold_restart_bank(b, 0, 0);
287 affinity_restore(oldmask);
288
289 return end - buf;
290}
291
292static ssize_t store_threshold_limit(struct threshold_block *b,
293 const char *buf, size_t count)
294{
295 char *end;
296 cpumask_t oldmask;
297 u16 old;
298 unsigned long new = simple_strtoul(buf, &end, 0);
299 if (end == buf)
300 return -EINVAL;
301 if (new > THRESHOLD_MAX)
302 new = THRESHOLD_MAX;
303 if (new < 1)
304 new = 1;
305 old = b->threshold_limit;
306 b->threshold_limit = new;
307
308 oldmask = affinity_set(b->cpu);
309 threshold_restart_bank(b, 0, old);
310 affinity_restore(oldmask);
311
312 return end - buf;
313}
314
315static ssize_t show_error_count(struct threshold_block *b, char *buf)
316{
317 u32 high, low;
318 cpumask_t oldmask;
319 oldmask = affinity_set(b->cpu);
320 rdmsr(b->address, low, high);
321 affinity_restore(oldmask);
322 return sprintf(buf, "%x\n",
323 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
324}
325
326static ssize_t store_error_count(struct threshold_block *b,
327 const char *buf, size_t count)
328{
329 cpumask_t oldmask;
330 oldmask = affinity_set(b->cpu);
331 threshold_restart_bank(b, 1, 0);
332 affinity_restore(oldmask);
333 return 1;
334}
335
336#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
337 .attr = {.name = __stringify(_name), .mode = _mode }, \
338 .show = _show, \
339 .store = _store, \
340};
341
342#define RW_ATTR(name) \
343static struct threshold_attr name = \
344 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
345
346RW_ATTR(interrupt_enable);
347RW_ATTR(threshold_limit);
348RW_ATTR(error_count);
349
350static struct attribute *default_attrs[] = {
351 &interrupt_enable.attr,
352 &threshold_limit.attr,
353 &error_count.attr,
354 NULL
355};
356
357#define to_block(k) container_of(k, struct threshold_block, kobj)
358#define to_attr(a) container_of(a, struct threshold_attr, attr)
359
360static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
361{
362 struct threshold_block *b = to_block(kobj);
363 struct threshold_attr *a = to_attr(attr);
364 ssize_t ret;
365 ret = a->show ? a->show(b, buf) : -EIO;
366 return ret;
367}
368
369static ssize_t store(struct kobject *kobj, struct attribute *attr,
370 const char *buf, size_t count)
371{
372 struct threshold_block *b = to_block(kobj);
373 struct threshold_attr *a = to_attr(attr);
374 ssize_t ret;
375 ret = a->store ? a->store(b, buf, count) : -EIO;
376 return ret;
377}
378
379static struct sysfs_ops threshold_ops = {
380 .show = show,
381 .store = store,
382};
383
384static struct kobj_type threshold_ktype = {
385 .sysfs_ops = &threshold_ops,
386 .default_attrs = default_attrs,
387};
388
389static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
390 unsigned int bank,
391 unsigned int block,
392 u32 address)
393{
394 int err;
395 u32 low, high;
396 struct threshold_block *b = NULL;
397
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0;
400
401 if (rdmsr_safe(address, &low, &high))
402 return 0;
403
404 if (!(high & MASK_VALID_HI)) {
405 if (block)
406 goto recurse;
407 else
408 return 0;
409 }
410
411 if (!(high & MASK_CNTP_HI) ||
412 (high & MASK_LOCKED_HI))
413 goto recurse;
414
415 b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
416 if (!b)
417 return -ENOMEM;
418
419 b->block = block;
420 b->bank = bank;
421 b->cpu = cpu;
422 b->address = address;
423 b->interrupt_enable = 0;
424 b->threshold_limit = THRESHOLD_MAX;
425
426 INIT_LIST_HEAD(&b->miscj);
427
428 if (per_cpu(threshold_banks, cpu)[bank]->blocks)
429 list_add(&b->miscj,
430 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
431 else
432 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
433
434 kobject_set_name(&b->kobj, "misc%i", block);
435 b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
436 b->kobj.ktype = &threshold_ktype;
437 err = kobject_register(&b->kobj);
438 if (err)
439 goto out_free;
440recurse:
441 if (!block) {
442 address = (low & MASK_BLKPTR_LO) >> 21;
443 if (!address)
444 return 0;
445 address += MCG_XBLK_ADDR;
446 } else
447 ++address;
448
449 err = allocate_threshold_blocks(cpu, bank, ++block, address);
450 if (err)
451 goto out_free;
452
453 return err;
454
455out_free:
456 if (b) {
457 kobject_unregister(&b->kobj);
458 kfree(b);
459 }
460 return err;
461}
462
463/* symlinks sibling shared banks to first core. first core owns dir/files. */
464static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
465{
466 int i, err = 0;
467 struct threshold_bank *b = NULL;
468 cpumask_t oldmask = CPU_MASK_NONE;
469 char name[32];
470
471 sprintf(name, "threshold_bank%i", bank);
472
473#ifdef CONFIG_SMP
474 if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */
475 i = first_cpu(cpu_core_map[cpu]);
476
477 /* first core not up yet */
478 if (cpu_data[i].cpu_core_id)
479 goto out;
480
481 /* already linked */
482 if (per_cpu(threshold_banks, cpu)[bank])
483 goto out;
484
485 b = per_cpu(threshold_banks, i)[bank];
486
487 if (!b)
488 goto out;
489
490 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
491 &b->kobj, name);
492 if (err)
493 goto out;
494
495 b->cpus = cpu_core_map[cpu];
496 per_cpu(threshold_banks, cpu)[bank] = b;
497 goto out;
498 }
499#endif
500
501 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
502 if (!b) {
503 err = -ENOMEM;
504 goto out;
505 }
506
507 kobject_set_name(&b->kobj, "threshold_bank%i", bank);
508 b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
509#ifndef CONFIG_SMP
510 b->cpus = CPU_MASK_ALL;
511#else
512 b->cpus = cpu_core_map[cpu];
513#endif
514 err = kobject_register(&b->kobj);
515 if (err)
516 goto out_free;
517
518 per_cpu(threshold_banks, cpu)[bank] = b;
519
520 oldmask = affinity_set(cpu);
521 err = allocate_threshold_blocks(cpu, bank, 0,
522 MSR_IA32_MC0_MISC + bank * 4);
523 affinity_restore(oldmask);
524
525 if (err)
526 goto out_free;
527
528 for_each_cpu_mask(i, b->cpus) {
529 if (i == cpu)
530 continue;
531
532 err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
533 &b->kobj, name);
534 if (err)
535 goto out;
536
537 per_cpu(threshold_banks, i)[bank] = b;
538 }
539
540 goto out;
541
542out_free:
543 per_cpu(threshold_banks, cpu)[bank] = NULL;
544 kfree(b);
545out:
546 return err;
547}
548
549/* create dir/files for all valid threshold banks */
550static __cpuinit int threshold_create_device(unsigned int cpu)
551{
552 unsigned int bank;
553 int err = 0;
554
555 for (bank = 0; bank < NR_BANKS; ++bank) {
556 if (!(per_cpu(bank_map, cpu) & 1 << bank))
557 continue;
558 err = threshold_create_bank(cpu, bank);
559 if (err)
560 goto out;
561 }
562out:
563 return err;
564}
565
566/*
567 * Let's be hotplug friendly.
568 * In case of multi-core processors, the first core always takes ownership
569 * of the shared sysfs dir/files, and the rest of the cores are symlinked to it.
570 */
571
572static void deallocate_threshold_block(unsigned int cpu,
573 unsigned int bank)
574{
575 struct threshold_block *pos = NULL;
576 struct threshold_block *tmp = NULL;
577 struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
578
579 if (!head)
580 return;
581
582 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
583 kobject_unregister(&pos->kobj);
584 list_del(&pos->miscj);
585 kfree(pos);
586 }
587
588 kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
589 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
590}
591
592static void threshold_remove_bank(unsigned int cpu, int bank)
593{
594 int i = 0;
595 struct threshold_bank *b;
596 char name[32];
597
598 b = per_cpu(threshold_banks, cpu)[bank];
599
600 if (!b)
601 return;
602
603 if (!b->blocks)
604 goto free_out;
605
606 sprintf(name, "threshold_bank%i", bank);
607
608#ifdef CONFIG_SMP
609 /* sibling symlink */
610 if (shared_bank[bank] && b->blocks->cpu != cpu) {
611 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
612 per_cpu(threshold_banks, cpu)[bank] = NULL;
613 return;
614 }
615#endif
616
617 /* remove all sibling symlinks before unregistering */
618 for_each_cpu_mask(i, b->cpus) {
619 if (i == cpu)
620 continue;
621
622 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
623 per_cpu(threshold_banks, i)[bank] = NULL;
624 }
625
626 deallocate_threshold_block(cpu, bank);
627
628free_out:
629 kobject_unregister(&b->kobj);
630 kfree(b);
631 per_cpu(threshold_banks, cpu)[bank] = NULL;
632}
633
634static void threshold_remove_device(unsigned int cpu)
635{
636 unsigned int bank;
637
638 for (bank = 0; bank < NR_BANKS; ++bank) {
639 if (!(per_cpu(bank_map, cpu) & 1 << bank))
640 continue;
641 threshold_remove_bank(cpu, bank);
642 }
643}
644
645/* get notified when a cpu comes on/off */
646static int threshold_cpu_callback(struct notifier_block *nfb,
647 unsigned long action, void *hcpu)
648{
649 /* cpu was unsigned int to begin with */
650 unsigned int cpu = (unsigned long)hcpu;
651
652 if (cpu >= NR_CPUS)
653 goto out;
654
655 switch (action) {
656 case CPU_ONLINE:
657 case CPU_ONLINE_FROZEN:
658 threshold_create_device(cpu);
659 break;
660 case CPU_DEAD:
661 case CPU_DEAD_FROZEN:
662 threshold_remove_device(cpu);
663 break;
664 default:
665 break;
666 }
667 out:
668 return NOTIFY_OK;
669}
670
671static struct notifier_block threshold_cpu_notifier = {
672 .notifier_call = threshold_cpu_callback,
673};
674
675static __init int threshold_init_device(void)
676{
677 unsigned lcpu = 0;
678
679 /* to hit CPUs online before the notifier is up */
680 for_each_online_cpu(lcpu) {
681 int err = threshold_create_device(lcpu);
682 if (err)
683 return err;
684 }
685 register_hotcpu_notifier(&threshold_cpu_notifier);
686 return 0;
687}
688
689device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c
new file mode 100644
index 000000000000..6551505d8a2c
--- /dev/null
+++ b/arch/x86/kernel/mce_intel_64.c
@@ -0,0 +1,89 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13#include <asm/idle.h>
14#include <asm/therm_throt.h>
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 __u64 msr_val;
19
20 ack_APIC_irq();
21
22 exit_idle();
23 irq_enter();
24
25 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28
29 irq_exit();
30}
31
32static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
33{
34 u32 l, h;
35 int tm2 = 0;
36 unsigned int cpu = smp_processor_id();
37
38 if (!cpu_has(c, X86_FEATURE_ACPI))
39 return;
40
41 if (!cpu_has(c, X86_FEATURE_ACC))
42 return;
43
44 /* first check if TM1 is already enabled by the BIOS, in which
45 * case there might be some SMM goo which handles it, so we can't even
46 * put a handler since it might be delivered via SMI already.
47 */
48 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
49 h = apic_read(APIC_LVTTHMR);
50 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
51 printk(KERN_DEBUG
52 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
53 return;
54 }
55
56 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
57 tm2 = 1;
58
59 if (h & APIC_VECTOR_MASK) {
60 printk(KERN_DEBUG
61 "CPU%d: Thermal LVT vector (%#x) already "
62 "installed\n", cpu, (h & APIC_VECTOR_MASK));
63 return;
64 }
65
66 h = THERMAL_APIC_VECTOR;
67 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
68 apic_write(APIC_LVTTHMR, h);
69
70 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
71 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
72
73 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
74 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
75
76 l = apic_read(APIC_LVTTHMR);
77 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
78 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
79 cpu, tm2 ? "TM2" : "TM1");
80
81 /* enable thermal throttle processing */
82 atomic_set(&therm_throt_en, 1);
83 return;
84}
85
86void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
87{
88 intel_init_thermal(c);
89}
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
new file mode 100644
index 000000000000..09cf78110358
--- /dev/null
+++ b/arch/x86/kernel/microcode.c
@@ -0,0 +1,850 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
12 * Order Number 245472 or free download from:
13 *
14 * http://developer.intel.com/design/pentium4/manuals/245472.htm
15 *
16 * For more information, go to http://www.urbanmyth.org/microcode
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 *
23 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
24 * Initial release.
25 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
26 * Added read() support + cleanups.
27 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
28 * Added 'device trimming' support. open(O_WRONLY) zeroes
29 * and frees the saved copy of applied microcode.
30 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
31 * Made to use devfs (/dev/cpu/microcode) + cleanups.
32 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
33 * Added misc device support (now uses both devfs and misc).
34 * Added MICROCODE_IOCFREE ioctl to clear memory.
35 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
36 * Messages for error cases (non Intel & no suitable microcode).
37 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
38 * Removed ->release(). Removed exclusive open and status bitmap.
39 * Added microcode_rwsem to serialize read()/write()/ioctl().
40 * Removed global kernel lock usage.
41 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
42 * Write 0 to 0x8B msr and then cpuid before reading revision,
43 * so that it works even if there were no update done by the
44 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
45 * to be 0 on my machine which is why it worked even when I
46 * disabled update by the BIOS)
47 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
48 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
49 * Tigran Aivazian <tigran@veritas.com>
50 * Intel Pentium 4 processor support and bugfixes.
51 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
52 * Bugfix for HT (Hyper-Threading) enabled processors
53 * whereby processor resources are shared by all logical processors
54 * in a single CPU package.
55 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
56 * Tigran Aivazian <tigran@veritas.com>,
57 * Serialize updates as required on HT processors due to speculative
58 * nature of implementation.
59 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
60 * Fix the panic when writing zero-length microcode chunk.
61 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
62 * Jun Nakajima <jun.nakajima@intel.com>
63 * Support for the microcode updates in the new format.
64 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
65 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
66 * because we no longer hold a copy of applied microcode
67 * in kernel memory.
68 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
69 * Fix sigmatch() macro to handle old CPUs with pf == 0.
70 * Thanks to Stuart Swales for pointing out this bug.
71 */
72
73//#define DEBUG /* pr_debug */
74#include <linux/capability.h>
75#include <linux/kernel.h>
76#include <linux/init.h>
77#include <linux/sched.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94
95MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
96MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
97MODULE_LICENSE("GPL");
98
99#define MICROCODE_VERSION "1.14a"
100
101#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
102#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
103#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
104#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
105#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
106#define DWSIZE (sizeof (u32))
107#define get_totalsize(mc) \
108 (((microcode_t *)mc)->hdr.totalsize ? \
109 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
110#define get_datasize(mc) \
111 (((microcode_t *)mc)->hdr.datasize ? \
112 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
113
114#define sigmatch(s1, s2, p1, p2) \
115 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
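/* sigmatch(): the signatures must be identical and the platform-flag masks
   must share a bit; the (0,0) case keeps old CPUs that report pf == 0 working
   (see changelog entry 1.14 above). */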
116
117#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
118
119/* serialize access to the physical write to MSR 0x79 */
120static DEFINE_SPINLOCK(microcode_update_lock);
121
122/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
123static DEFINE_MUTEX(microcode_mutex);
124
125static struct ucode_cpu_info {
126 int valid;
127 unsigned int sig;
128 unsigned int pf;
129 unsigned int rev;
130 microcode_t *mc;
131} ucode_cpu_info[NR_CPUS];
132
133static void collect_cpu_info(int cpu_num)
134{
135 struct cpuinfo_x86 *c = cpu_data + cpu_num;
136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
137 unsigned int val[2];
138
139 /* We should bind the task to the CPU */
140 BUG_ON(raw_smp_processor_id() != cpu_num);
141 uci->pf = uci->rev = 0;
142 uci->mc = NULL;
143 uci->valid = 1;
144
145 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
146 cpu_has(c, X86_FEATURE_IA64)) {
147 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
148 "processor\n", cpu_num);
149 uci->valid = 0;
150 return;
151 }
152
153 uci->sig = cpuid_eax(0x00000001);
154
155 if ((c->x86_model >= 5) || (c->x86 > 6)) {
156 /* get processor flags from MSR 0x17 */
157 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
158 uci->pf = 1 << ((val[1] >> 18) & 7);
159 }
160
161 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
162 /* see notes above for revision 1.07. Apparent chip bug */
163 sync_core();
164 /* get the current revision from MSR 0x8B */
165 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
166 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
167 uci->sig, uci->pf, uci->rev);
168}
169
170static inline int microcode_update_match(int cpu_num,
171 microcode_header_t *mc_header, int sig, int pf)
172{
173 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
174
175 if (!sigmatch(sig, uci->sig, pf, uci->pf)
176 || mc_header->rev <= uci->rev)
177 return 0;
178 return 1;
179}
180
181static int microcode_sanity_check(void *mc)
182{
183 microcode_header_t *mc_header = mc;
184 struct extended_sigtable *ext_header = NULL;
185 struct extended_signature *ext_sig;
186 unsigned long total_size, data_size, ext_table_size;
187 int sum, orig_sum, ext_sigcount = 0, i;
188
189 total_size = get_totalsize(mc_header);
190 data_size = get_datasize(mc_header);
191 if (data_size + MC_HEADER_SIZE > total_size) {
192 printk(KERN_ERR "microcode: error! "
193 "Bad data size in microcode data file\n");
194 return -EINVAL;
195 }
196
197 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
198 printk(KERN_ERR "microcode: error! "
199 "Unknown microcode update format\n");
200 return -EINVAL;
201 }
202 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
203 if (ext_table_size) {
204 if ((ext_table_size < EXT_HEADER_SIZE)
205 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
206 printk(KERN_ERR "microcode: error! "
207 "Small exttable size in microcode data file\n");
208 return -EINVAL;
209 }
210 ext_header = mc + MC_HEADER_SIZE + data_size;
211 if (ext_table_size != exttable_size(ext_header)) {
212 printk(KERN_ERR "microcode: error! "
213 "Bad exttable size in microcode data file\n");
214 return -EFAULT;
215 }
216 ext_sigcount = ext_header->count;
217 }
218
219 /* check extended table checksum */
220 if (ext_table_size) {
221 int ext_table_sum = 0;
222 int *ext_tablep = (int *)ext_header;
223
224 i = ext_table_size / DWSIZE;
225 while (i--)
226 ext_table_sum += ext_tablep[i];
227 if (ext_table_sum) {
228 printk(KERN_WARNING "microcode: aborting, "
229 "bad extended signature table checksum\n");
230 return -EINVAL;
231 }
232 }
233
234 /* calculate the checksum */
235 orig_sum = 0;
236 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
237 while (i--)
238 orig_sum += ((int *)mc)[i];
239 if (orig_sum) {
240 printk(KERN_ERR "microcode: aborting, bad checksum\n");
241 return -EINVAL;
242 }
243 if (!ext_table_size)
244 return 0;
245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
252 if (sum) {
253 printk(KERN_ERR "microcode: aborting, bad checksum\n");
254 return -EINVAL;
255 }
256 }
257 return 0;
258}
259
260/*
261 * return 0 - no update found
262 * return 1 - found update
263 * return < 0 - error
264 */
265static int get_maching_microcode(void *mc, int cpu)
266{
267 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
268 microcode_header_t *mc_header = mc;
269 struct extended_sigtable *ext_header;
270 unsigned long total_size = get_totalsize(mc_header);
271 int ext_sigcount, i;
272 struct extended_signature *ext_sig;
273 void *new_mc;
274
275 if (microcode_update_match(cpu, mc_header,
276 mc_header->sig, mc_header->pf))
277 goto find;
278
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0;
281
282 ext_header = (struct extended_sigtable *)(mc +
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU %d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341		printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 pr_debug("microcode: CPU%d updated from revision "
346		"0x%x to 0x%x, date = %08x\n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
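The two value arguments passed to wrmsr() above are simply the low and high 32 bits of the linear address of the update data. A minimal sketch of the idiom (wrmsr_ptr is a hypothetical helper, not part of the driver):

/*
 * Illustration only: WRMSR takes its value as an edx:eax pair, so the
 * address is split into 32-bit halves. The ">> 16 >> 16" form used in
 * apply_microcode() yields 0 on 32-bit kernels, where the address is
 * only 32 bits wide, without relying on an undefined shift by 32.
 */
static inline void wrmsr_ptr(unsigned int msr, const void *ptr)
{
	unsigned long val = (unsigned long)ptr;

	wrmsr(msr, (unsigned int)val, (unsigned int)(val >> 16 >> 16));
}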
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size;	/* its size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364		printk(KERN_ERR "microcode: error! Cannot read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377		printk(KERN_ERR "microcode: error! Cannot read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399		 * It's possible the data file has multiple matching ucode,
400		 * so keep searching until the latest version is found
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed(current, cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed(current, old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
428}
429
430static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
431{
432 ssize_t ret;
433
434 if ((len >> PAGE_SHIFT) > num_physpages) {
435 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
436 return -EINVAL;
437 }
438
439 lock_cpu_hotplug();
440 mutex_lock(&microcode_mutex);
441
442 user_buffer = (void __user *) buf;
443 user_buffer_size = (int) len;
444
445 ret = do_microcode_update();
446 if (!ret)
447 ret = (ssize_t)len;
448
449 mutex_unlock(&microcode_mutex);
450 unlock_cpu_hotplug();
451
452 return ret;
453}
454
455static const struct file_operations microcode_fops = {
456 .owner = THIS_MODULE,
457 .write = microcode_write,
458 .open = microcode_open,
459};
460
461static struct miscdevice microcode_dev = {
462 .minor = MICROCODE_MINOR,
463 .name = "microcode",
464 .fops = &microcode_fops,
465};
466
467static int __init microcode_dev_init (void)
468{
469 int error;
470
471 error = misc_register(&microcode_dev);
472 if (error) {
473 printk(KERN_ERR
474 "microcode: can't misc_register on minor=%d\n",
475 MICROCODE_MINOR);
476 return error;
477 }
478
479 return 0;
480}
481
482static void microcode_dev_exit (void)
483{
484 misc_deregister(&microcode_dev);
485}
486
487MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
488#else
489#define microcode_dev_init() 0
490#define microcode_dev_exit() do { } while(0)
491#endif
492
493static long get_next_ucode_from_buffer(void **mc, void *buf,
494 unsigned long size, long offset)
495{
496 microcode_header_t *mc_header;
497 unsigned long total_size;
498
499 /* No more data */
500 if (offset >= size)
501 return 0;
502 mc_header = (microcode_header_t *)(buf + offset);
503 total_size = get_totalsize(mc_header);
504
505 if (offset + total_size > size) {
506 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
507 return -EINVAL;
508 }
509
510 *mc = vmalloc(total_size);
511 if (!*mc) {
512		printk(KERN_ERR "microcode: error! Cannot allocate memory\n");
513 return -ENOMEM;
514 }
515 memcpy(*mc, buf + offset, total_size);
516 return offset + total_size;
517}
518
519/* fake device for request_firmware */
520static struct platform_device *microcode_pdev;
521
522static int cpu_request_microcode(int cpu)
523{
524 char name[30];
525 struct cpuinfo_x86 *c = cpu_data + cpu;
526 const struct firmware *firmware;
527 void *buf;
528 unsigned long size;
529 long offset = 0;
530 int error;
531 void *mc;
532
533 /* We should bind the task to the CPU */
534 BUG_ON(cpu != raw_smp_processor_id());
535 sprintf(name,"intel-ucode/%02x-%02x-%02x",
536 c->x86, c->x86_model, c->x86_mask);
537 error = request_firmware(&firmware, name, &microcode_pdev->dev);
538 if (error) {
539 pr_debug("ucode data file %s load failed\n", name);
540 return error;
541 }
542 buf = (void *)firmware->data;
543 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) {
546 error = microcode_sanity_check(mc);
547 if (error)
548 break;
549 error = get_maching_microcode(mc, cpu);
550 if (error < 0)
551 break;
552 /*
553		 * It's possible the data file has multiple matching ucode,
554		 * so keep searching until the latest version is found
555 */
556 if (error == 1) {
557 apply_microcode(cpu);
558 error = 0;
559 }
560 vfree(mc);
561 }
562 if (offset > 0)
563 vfree(mc);
564 if (offset < 0)
565 error = offset;
566 release_firmware(firmware);
567
568 return error;
569}
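As an illustration of the firmware name built above (the numbers are invented for the example): a CPU reporting family 6, model 15, stepping 2 asks the firmware loader for "intel-ucode/06-0f-02".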
570
571static int apply_microcode_check_cpu(int cpu)
572{
573 struct cpuinfo_x86 *c = cpu_data + cpu;
574 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
575 cpumask_t old;
576 unsigned int val[2];
577 int err = 0;
578
579 /* Check if the microcode is available */
580 if (!uci->mc)
581 return 0;
582
583 old = current->cpus_allowed;
584 set_cpus_allowed(current, cpumask_of_cpu(cpu));
585
586 /* Check if the microcode we have in memory matches the CPU */
587 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
588 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
589 err = -EINVAL;
590
591 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
592 /* get processor flags from MSR 0x17 */
593 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
594 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
595 err = -EINVAL;
596 }
597
598 if (!err) {
599 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
600 /* see notes above for revision 1.07. Apparent chip bug */
601 sync_core();
602 /* get the current revision from MSR 0x8B */
603 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
604 if (uci->rev != val[1])
605 err = -EINVAL;
606 }
607
608 if (!err)
609 apply_microcode(cpu);
610 else
611 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
612 " sig=0x%x, pf=0x%x, rev=0x%x\n",
613 cpu, uci->sig, uci->pf, uci->rev);
614
615 set_cpus_allowed(current, old);
616 return err;
617}
618
619static void microcode_init_cpu(int cpu, int resume)
620{
621 cpumask_t old;
622 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
623
624 old = current->cpus_allowed;
625
626 set_cpus_allowed(current, cpumask_of_cpu(cpu));
627 mutex_lock(&microcode_mutex);
628 collect_cpu_info(cpu);
629 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
630 cpu_request_microcode(cpu);
631 mutex_unlock(&microcode_mutex);
632 set_cpus_allowed(current, old);
633}
634
635static void microcode_fini_cpu(int cpu)
636{
637 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
638
639 mutex_lock(&microcode_mutex);
640 uci->valid = 0;
641 vfree(uci->mc);
642 uci->mc = NULL;
643 mutex_unlock(&microcode_mutex);
644}
645
646static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
647{
648 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
649 char *end;
650 unsigned long val = simple_strtoul(buf, &end, 0);
651 int err = 0;
652 int cpu = dev->id;
653
654 if (end == buf)
655 return -EINVAL;
656 if (val == 1) {
657 cpumask_t old;
658
659 old = current->cpus_allowed;
660
661 lock_cpu_hotplug();
662 set_cpus_allowed(current, cpumask_of_cpu(cpu));
663
664 mutex_lock(&microcode_mutex);
665 if (uci->valid)
666 err = cpu_request_microcode(cpu);
667 mutex_unlock(&microcode_mutex);
668 unlock_cpu_hotplug();
669 set_cpus_allowed(current, old);
670 }
671 if (err)
672 return err;
673 return sz;
674}
675
676static ssize_t version_show(struct sys_device *dev, char *buf)
677{
678 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
679
680 return sprintf(buf, "0x%x\n", uci->rev);
681}
682
683static ssize_t pf_show(struct sys_device *dev, char *buf)
684{
685 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
686
687 return sprintf(buf, "0x%x\n", uci->pf);
688}
689
690static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
691static SYSDEV_ATTR(version, 0400, version_show, NULL);
692static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
693
694static struct attribute *mc_default_attrs[] = {
695 &attr_reload.attr,
696 &attr_version.attr,
697 &attr_processor_flags.attr,
698 NULL
699};
700
701static struct attribute_group mc_attr_group = {
702 .attrs = mc_default_attrs,
703 .name = "microcode",
704};
705
706static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
707{
708 int err, cpu = sys_dev->id;
709 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
710
711 if (!cpu_online(cpu))
712 return 0;
713
714	pr_debug("Microcode: CPU %d added\n", cpu);
715 memset(uci, 0, sizeof(*uci));
716
717 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
718 if (err)
719 return err;
720
721 microcode_init_cpu(cpu, resume);
722
723 return 0;
724}
725
726static int mc_sysdev_add(struct sys_device *sys_dev)
727{
728 return __mc_sysdev_add(sys_dev, 0);
729}
730
731static int mc_sysdev_remove(struct sys_device *sys_dev)
732{
733 int cpu = sys_dev->id;
734
735 if (!cpu_online(cpu))
736 return 0;
737
738	pr_debug("Microcode: CPU %d removed\n", cpu);
739 microcode_fini_cpu(cpu);
740 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
741 return 0;
742}
743
744static int mc_sysdev_resume(struct sys_device *dev)
745{
746 int cpu = dev->id;
747
748 if (!cpu_online(cpu))
749 return 0;
750	pr_debug("Microcode: CPU %d resumed\n", cpu);
751 /* only CPU 0 will apply ucode here */
752 apply_microcode(0);
753 return 0;
754}
755
756static struct sysdev_driver mc_sysdev_driver = {
757 .add = mc_sysdev_add,
758 .remove = mc_sysdev_remove,
759 .resume = mc_sysdev_resume,
760};
761
762static __cpuinit int
763mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
764{
765 unsigned int cpu = (unsigned long)hcpu;
766 struct sys_device *sys_dev;
767
768 sys_dev = get_cpu_sysdev(cpu);
769 switch (action) {
770 case CPU_UP_CANCELED_FROZEN:
771 /* The CPU refused to come up during a system resume */
772 microcode_fini_cpu(cpu);
773 break;
774 case CPU_ONLINE:
775 case CPU_DOWN_FAILED:
776 mc_sysdev_add(sys_dev);
777 break;
778 case CPU_ONLINE_FROZEN:
779 /* System-wide resume is in progress, try to apply microcode */
780 if (apply_microcode_check_cpu(cpu)) {
781 /* The application of microcode failed */
782 microcode_fini_cpu(cpu);
783 __mc_sysdev_add(sys_dev, 1);
784 break;
785 }
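		/* fall through: the sysfs group was removed on suspend, re-create it */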
786 case CPU_DOWN_FAILED_FROZEN:
787 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
788 printk(KERN_ERR "Microcode: Failed to create the sysfs "
789 "group for CPU%d\n", cpu);
790 break;
791 case CPU_DOWN_PREPARE:
792 mc_sysdev_remove(sys_dev);
793 break;
794 case CPU_DOWN_PREPARE_FROZEN:
795 /* Suspend is in progress, only remove the interface */
796 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
797 break;
798 }
799 return NOTIFY_OK;
800}
801
802static struct notifier_block __cpuinitdata mc_cpu_notifier = {
803 .notifier_call = mc_cpu_callback,
804};
805
806static int __init microcode_init (void)
807{
808 int error;
809
810 error = microcode_dev_init();
811 if (error)
812 return error;
813 microcode_pdev = platform_device_register_simple("microcode", -1,
814 NULL, 0);
815 if (IS_ERR(microcode_pdev)) {
816 microcode_dev_exit();
817 return PTR_ERR(microcode_pdev);
818 }
819
820 lock_cpu_hotplug();
821 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
822 unlock_cpu_hotplug();
823 if (error) {
824 microcode_dev_exit();
825 platform_device_unregister(microcode_pdev);
826 return error;
827 }
828
829 register_hotcpu_notifier(&mc_cpu_notifier);
830
831 printk(KERN_INFO
832 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
833 return 0;
834}
835
836static void __exit microcode_exit (void)
837{
838 microcode_dev_exit();
839
840 unregister_hotcpu_notifier(&mc_cpu_notifier);
841
842 lock_cpu_hotplug();
843 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
844 unlock_cpu_hotplug();
845
846 platform_device_unregister(microcode_pdev);
847}
848
849module_init(microcode_init)
850module_exit(microcode_exit)
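A rough user-space sketch of how the legacy CONFIG_MICROCODE_OLD_INTERFACE device above is driven: the whole update file, already in the binary Intel format checked by microcode_sanity_check(), is handed to the character device in one write(). The /dev/cpu/microcode path is the conventional node for MICROCODE_MINOR; the input file name here is made up.

/* Illustration only -- not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("ucode.bin", "rb");	/* hypothetical input file */
	int fd = open("/dev/cpu/microcode", O_WRONLY);
	char *buf;
	long len;

	if (!f || fd < 0)
		return 1;
	fseek(f, 0, SEEK_END);
	len = ftell(f);
	rewind(f);
	buf = malloc(len);
	if (!buf || fread(buf, 1, len, f) != (size_t)len)
		return 1;
	if (write(fd, buf, len) != len)	/* one write, as microcode_write() expects */
		perror("microcode write");
	free(buf);
	close(fd);
	fclose(f);
	return 0;
}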
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
new file mode 100644
index 000000000000..3db0a5442eb1
--- /dev/null
+++ b/arch/x86/kernel/module_32.c
@@ -0,0 +1,152 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85			/* Add the value, subtract its position */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
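For reference, the two i386 relocation forms handled above compute the following (S is the symbol value, A the addend already stored at the target, P the address being patched):

/*
 *   R_386_32   : *P = A + S        (absolute)
 *   R_386_PC32 : *P = A + S - P    (PC-relative, e.g. call/jmp targets)
 */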
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122			locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
new file mode 100644
index 000000000000..a888e67f5874
--- /dev/null
+++ b/arch/x86/kernel/module_64.c
@@ -0,0 +1,185 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/bug.h>
27
28#include <asm/system.h>
29#include <asm/page.h>
30#include <asm/pgtable.h>
31
32#define DEBUGP(fmt...)
33
34#ifndef CONFIG_UML
35void module_free(struct module *mod, void *module_region)
36{
37 vfree(module_region);
38 /* FIXME: If module_region == mod->init_region, trim exception
39 table entries. */
40}
41
42void *module_alloc(unsigned long size)
43{
44 struct vm_struct *area;
45
46 if (!size)
47 return NULL;
48 size = PAGE_ALIGN(size);
49 if (size > MODULES_LEN)
50 return NULL;
51
52 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
53 if (!area)
54 return NULL;
55
56 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
57}
58#endif
59
60/* We don't need anything special. */
61int module_frob_arch_sections(Elf_Ehdr *hdr,
62 Elf_Shdr *sechdrs,
63 char *secstrings,
64 struct module *mod)
65{
66 return 0;
67}
68
69int apply_relocate_add(Elf64_Shdr *sechdrs,
70 const char *strtab,
71 unsigned int symindex,
72 unsigned int relsec,
73 struct module *me)
74{
75 unsigned int i;
76 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
77 Elf64_Sym *sym;
78 void *loc;
79 u64 val;
80
81 DEBUGP("Applying relocate section %u to %u\n", relsec,
82 sechdrs[relsec].sh_info);
83 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
84 /* This is where to make the change */
85 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
86 + rel[i].r_offset;
87
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
91 + ELF64_R_SYM(rel[i].r_info);
92
93 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
94 (int)ELF64_R_TYPE(rel[i].r_info),
95 sym->st_value, rel[i].r_addend, (u64)loc);
96
97 val = sym->st_value + rel[i].r_addend;
98
99 switch (ELF64_R_TYPE(rel[i].r_info)) {
100 case R_X86_64_NONE:
101 break;
102 case R_X86_64_64:
103 *(u64 *)loc = val;
104 break;
105 case R_X86_64_32:
106 *(u32 *)loc = val;
107 if (val != *(u32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_32S:
111 *(s32 *)loc = val;
112 if ((s64)val != *(s32 *)loc)
113 goto overflow;
114 break;
115 case R_X86_64_PC32:
116 val -= (u64)loc;
117 *(u32 *)loc = val;
118#if 0
119 if ((s64)val != *(s32 *)loc)
120 goto overflow;
121#endif
122 break;
123 default:
124 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
125 me->name, ELF64_R_TYPE(rel[i].r_info));
126 return -ENOEXEC;
127 }
128 }
129 return 0;
130
131overflow:
132 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
133 (int)ELF64_R_TYPE(rel[i].r_info), val);
134 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
135 me->name);
136 return -ENOEXEC;
137}
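For reference, the x86-64 relocation semantics assumed by the cases above (S = st_value, A = r_addend, P = loc):

/*
 *   R_X86_64_NONE : no-op
 *   R_X86_64_64   : word64  S + A
 *   R_X86_64_32   : word32  S + A      (result must zero-extend back)
 *   R_X86_64_32S  : word32  S + A      (result must sign-extend back)
 *   R_X86_64_PC32 : word32  S + A - P
 *
 * The 32/32S overflow checks are why modules have to be built with
 * -mcmodel=kernel: referenced symbols must stay within the +/-2GB range
 * that the truncated 32-bit fields can express.
 */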
138
139int apply_relocate(Elf_Shdr *sechdrs,
140 const char *strtab,
141 unsigned int symindex,
142 unsigned int relsec,
143 struct module *me)
144{
145	printk(KERN_ERR "module %s: non-add relocation not supported\n", me->name);
146 return -ENOSYS;
147}
148
149int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs,
151 struct module *me)
152{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
157 if (!strcmp(".text", secstrings + s->sh_name))
158 text = s;
159 if (!strcmp(".altinstructions", secstrings + s->sh_name))
160 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162			locks = s;
163 }
164
165 if (alt) {
166 /* patch .altinstructions */
167 void *aseg = (void *)alt->sh_addr;
168 apply_alternatives(aseg, aseg + alt->sh_size);
169 }
170 if (locks && text) {
171 void *lseg = (void *)locks->sh_addr;
172 void *tseg = (void *)text->sh_addr;
173 alternatives_smp_module_add(me, me->name,
174 lseg, lseg + locks->sh_size,
175 tseg, tseg + text->sh_size);
176 }
177
178 return module_bug_finalize(hdr, sechdrs, me);
179}
180
181void module_arch_cleanup(struct module *mod)
182{
183 alternatives_smp_module_del(mod);
184 module_bug_cleanup(mod);
185}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
new file mode 100644
index 000000000000..13abb4ebfb79
--- /dev/null
+++ b/arch/x86/kernel/mpparse_32.c
@@ -0,0 +1,1132 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/acpi.h>
19#include <linux/delay.h>
20#include <linux/bootmem.h>
21#include <linux/kernel_stat.h>
22#include <linux/mc146818rtc.h>
23#include <linux/bitops.h>
24
25#include <asm/smp.h>
26#include <asm/acpi.h>
27#include <asm/mtrr.h>
28#include <asm/mpspec.h>
29#include <asm/io_apic.h>
30
31#include <mach_apic.h>
32#include <mach_apicdef.h>
33#include <mach_mpparse.h>
34#include <bios_ebda.h>
35
36/* Have we found an MP table */
37int smp_found_config;
38unsigned int __cpuinitdata maxcpus = NR_CPUS;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45int mp_bus_id_to_type [MAX_MP_BUSSES];
46int mp_bus_id_to_node [MAX_MP_BUSSES];
47int mp_bus_id_to_local [MAX_MP_BUSSES];
48int quad_local_to_mp_bus_id [NR_CPUS/4][4];
49int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
50static int mp_current_pci_id;
51
52/* I/O APIC entries */
53struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
54
55/* # of MP IRQ source entries */
56struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
57
58/* MP IRQ source entries */
59int mp_irq_entries;
60
61int nr_ioapics;
62
63int pic_mode;
64unsigned long mp_lapic_addr;
65
66unsigned int def_to_bigsmp = 0;
67
68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */
71unsigned int __cpuinitdata num_processors;
72
73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map;
75
76u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
77
78/*
79 * Intel MP BIOS table parsing routines:
80 */
81
82
83/*
84 * Checksum an MP configuration block.
85 */
86
87static int __init mpf_checksum(unsigned char *mp, int len)
88{
89 int sum = 0;
90
91 while (len--)
92 sum += *mp++;
93
94 return sum & 0xFF;
95}
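The convention this relies on: every valid MP structure carries a checksum byte chosen so that all of its bytes sum to zero modulo 256, so any non-zero return here means corruption. A hypothetical builder-side helper, purely for illustration:

/*
 * Illustration only: pick the checksum byte for a structure whose
 * checksum field is still zero, so that mpf_checksum() over the whole
 * thing returns 0 afterwards.
 */
static unsigned char mpf_make_checksum(unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;
	return (unsigned char)(0x100 - (sum & 0xFF));
}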
96
97/*
98 * Have to match translation table entries to main table entries by counter
99 * hence the mpc_record variable .... can't see a less disgusting way of
100 * doing this ....
101 */
102
103static int mpc_record;
104static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
105
106static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
107{
108 int ver, apicid;
109 physid_mask_t phys_cpu;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 apicid = mpc_apic_id(m, translation_table[mpc_record]);
115
116 if (m->mpc_featureflag&(1<<0))
117 Dprintk(" Floating point unit present.\n");
118 if (m->mpc_featureflag&(1<<7))
119		Dprintk(" Machine Check Exception supported.\n");
120 if (m->mpc_featureflag&(1<<8))
121 Dprintk(" 64 bit compare & exchange supported.\n");
122 if (m->mpc_featureflag&(1<<9))
123 Dprintk(" Internal APIC present.\n");
124 if (m->mpc_featureflag&(1<<11))
125 Dprintk(" SEP present.\n");
126 if (m->mpc_featureflag&(1<<12))
127 Dprintk(" MTRR present.\n");
128 if (m->mpc_featureflag&(1<<13))
129 Dprintk(" PGE present.\n");
130 if (m->mpc_featureflag&(1<<14))
131 Dprintk(" MCA present.\n");
132 if (m->mpc_featureflag&(1<<15))
133 Dprintk(" CMOV present.\n");
134 if (m->mpc_featureflag&(1<<16))
135 Dprintk(" PAT present.\n");
136 if (m->mpc_featureflag&(1<<17))
137 Dprintk(" PSE present.\n");
138 if (m->mpc_featureflag&(1<<18))
139 Dprintk(" PSN present.\n");
140 if (m->mpc_featureflag&(1<<19))
141 Dprintk(" Cache Line Flush Instruction present.\n");
142 /* 20 Reserved */
143 if (m->mpc_featureflag&(1<<21))
144 Dprintk(" Debug Trace and EMON Store present.\n");
145 if (m->mpc_featureflag&(1<<22))
146 Dprintk(" ACPI Thermal Throttle Registers present.\n");
147 if (m->mpc_featureflag&(1<<23))
148 Dprintk(" MMX present.\n");
149 if (m->mpc_featureflag&(1<<24))
150 Dprintk(" FXSR present.\n");
151 if (m->mpc_featureflag&(1<<25))
152 Dprintk(" XMM present.\n");
153 if (m->mpc_featureflag&(1<<26))
154 Dprintk(" Willamette New Instructions present.\n");
155 if (m->mpc_featureflag&(1<<27))
156 Dprintk(" Self Snoop present.\n");
157 if (m->mpc_featureflag&(1<<28))
158 Dprintk(" HT present.\n");
159 if (m->mpc_featureflag&(1<<29))
160 Dprintk(" Thermal Monitor present.\n");
161 /* 30, 31 Reserved */
162
163
164 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
165 Dprintk(" Bootup CPU\n");
166 boot_cpu_physical_apicid = m->mpc_apicid;
167 }
168
169 ver = m->mpc_apicver;
170
171 /*
172 * Validate version
173 */
174 if (ver == 0x0) {
175 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
176 "fixing up to 0x10. (tell your hw vendor)\n",
177 m->mpc_apicid);
178 ver = 0x10;
179 }
180 apic_version[m->mpc_apicid] = ver;
181
182 phys_cpu = apicid_to_cpu_present(apicid);
183 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
184
185 if (num_processors >= NR_CPUS) {
186 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
187 " Processor ignored.\n", NR_CPUS);
188 return;
189 }
190
191 if (num_processors >= maxcpus) {
192 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
193 " Processor ignored.\n", maxcpus);
194 return;
195 }
196
197 cpu_set(num_processors, cpu_possible_map);
198 num_processors++;
199
200 /*
201 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
202 * but we need to work other dependencies like SMP_SUSPEND etc
203 * before this can be done without some confusion.
204 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
205 * - Ashok Raj <ashok.raj@intel.com>
206 */
207 if (num_processors > 8) {
208 switch (boot_cpu_data.x86_vendor) {
209 case X86_VENDOR_INTEL:
210 if (!APIC_XAPIC(ver)) {
211 def_to_bigsmp = 0;
212 break;
213 }
214 /* If P4 and above fall through */
215 case X86_VENDOR_AMD:
216 def_to_bigsmp = 1;
217 }
218 }
219 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
220}
221
222static void __init MP_bus_info (struct mpc_config_bus *m)
223{
224 char str[7];
225
226 memcpy(str, m->mpc_bustype, 6);
227 str[6] = 0;
228
229 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
230
231#if MAX_MP_BUSSES < 256
232 if (m->mpc_busid >= MAX_MP_BUSSES) {
233 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
234			"is too large, max. supported is %d\n",
235 m->mpc_busid, str, MAX_MP_BUSSES - 1);
236 return;
237 }
238#endif
239
240 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
241 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
242 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
243 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
244 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
245 mpc_oem_pci_bus(m, translation_table[mpc_record]);
246 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
247 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
248 mp_current_pci_id++;
249 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
250 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
251 } else {
252 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
253 }
254}
255
256static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
257{
258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return;
260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
265 MAX_IO_APICS, nr_ioapics);
266		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
267 }
268 if (!m->mpc_apicaddr) {
269 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
270 " found in MP table, skipping!\n");
271 return;
272 }
273 mp_ioapics[nr_ioapics] = *m;
274 nr_ioapics++;
275}
276
277static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
278{
279 mp_irqs [mp_irq_entries] = *m;
280 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
281 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
282 m->mpc_irqtype, m->mpc_irqflag & 3,
283 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
284 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
285 if (++mp_irq_entries == MAX_IRQ_SOURCES)
286 panic("Max # of irq sources exceeded!!\n");
287}
288
289static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
290{
291 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
292 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
293 m->mpc_irqtype, m->mpc_irqflag & 3,
294 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
295 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
296}
297
298#ifdef CONFIG_X86_NUMAQ
299static void __init MP_translation_info (struct mpc_config_translation *m)
300{
301 printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
302
303 if (mpc_record >= MAX_MPC_ENTRY)
304 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
305 else
306 translation_table[mpc_record] = m; /* stash this for later */
307 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
308 node_set_online(m->trans_quad);
309}
310
311/*
312 * Read/parse the MPC oem tables
313 */
314
315static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
316 unsigned short oemsize)
317{
318 int count = sizeof (*oemtable); /* the header size */
319 unsigned char *oemptr = ((unsigned char *)oemtable)+count;
320
321 mpc_record = 0;
322 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
323 if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
324 {
325 printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
326 oemtable->oem_signature[0],
327 oemtable->oem_signature[1],
328 oemtable->oem_signature[2],
329 oemtable->oem_signature[3]);
330 return;
331 }
332 if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
333 {
334 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
335 return;
336 }
337 while (count < oemtable->oem_length) {
338 switch (*oemptr) {
339 case MP_TRANSLATION:
340 {
341 struct mpc_config_translation *m=
342 (struct mpc_config_translation *)oemptr;
343 MP_translation_info(m);
344 oemptr += sizeof(*m);
345 count += sizeof(*m);
346 ++mpc_record;
347 break;
348 }
349 default:
350 {
351 printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
352 return;
353 }
354 }
355 }
356}
357
358static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
359 char *productid)
360{
361 if (strncmp(oem, "IBM NUMA", 8))
362		printk(KERN_WARNING "Warning! May not be a NUMA-Q system!\n");
363 if (mpc->mpc_oemptr)
364 smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
365 mpc->mpc_oemsize);
366}
367#endif /* CONFIG_X86_NUMAQ */
368
369/*
370 * Read/parse the MPC
371 */
372
373static int __init smp_read_mpc(struct mp_config_table *mpc)
374{
375 char str[16];
376 char oem[10];
377 int count=sizeof(*mpc);
378 unsigned char *mpt=((unsigned char *)mpc)+count;
379
380 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
381 printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
382 *(u32 *)mpc->mpc_signature);
383 return 0;
384 }
385 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
386 printk(KERN_ERR "SMP mptable: checksum error!\n");
387 return 0;
388 }
389 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
390 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
391 mpc->mpc_spec);
392 return 0;
393 }
394 if (!mpc->mpc_lapic) {
395 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
396 return 0;
397 }
398 memcpy(oem,mpc->mpc_oem,8);
399 oem[8]=0;
400 printk(KERN_INFO "OEM ID: %s ",oem);
401
402 memcpy(str,mpc->mpc_productid,12);
403 str[12]=0;
404 printk("Product ID: %s ",str);
405
406 mps_oem_check(mpc, oem, str);
407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
409
410 /*
411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI.
413 */
414 if (!acpi_lapic)
415 mp_lapic_addr = mpc->mpc_lapic;
416
417 /*
418 * Now process the configuration blocks.
419 */
420 mpc_record = 0;
421 while (count < mpc->mpc_length) {
422 switch(*mpt) {
423 case MP_PROCESSOR:
424 {
425 struct mpc_config_processor *m=
426 (struct mpc_config_processor *)mpt;
427 /* ACPI may have already provided this data */
428 if (!acpi_lapic)
429 MP_processor_info(m);
430 mpt += sizeof(*m);
431 count += sizeof(*m);
432 break;
433 }
434 case MP_BUS:
435 {
436 struct mpc_config_bus *m=
437 (struct mpc_config_bus *)mpt;
438 MP_bus_info(m);
439 mpt += sizeof(*m);
440 count += sizeof(*m);
441 break;
442 }
443 case MP_IOAPIC:
444 {
445 struct mpc_config_ioapic *m=
446 (struct mpc_config_ioapic *)mpt;
447 MP_ioapic_info(m);
448 mpt+=sizeof(*m);
449 count+=sizeof(*m);
450 break;
451 }
452 case MP_INTSRC:
453 {
454 struct mpc_config_intsrc *m=
455 (struct mpc_config_intsrc *)mpt;
456
457 MP_intsrc_info(m);
458 mpt+=sizeof(*m);
459 count+=sizeof(*m);
460 break;
461 }
462 case MP_LINTSRC:
463 {
464 struct mpc_config_lintsrc *m=
465 (struct mpc_config_lintsrc *)mpt;
466 MP_lintsrc_info(m);
467 mpt+=sizeof(*m);
468 count+=sizeof(*m);
469 break;
470 }
471 default:
472 {
473 count = mpc->mpc_length;
474 break;
475 }
476 }
477 ++mpc_record;
478 }
479 setup_apic_routing();
480 if (!num_processors)
481 printk(KERN_ERR "SMP mptable: no processors registered!\n");
482 return num_processors;
483}
484
485static int __init ELCR_trigger(unsigned int irq)
486{
487 unsigned int port;
488
489 port = 0x4d0 + (irq >> 3);
490 return (inb(port) >> (irq & 7)) & 1;
491}
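A quick worked example of the port arithmetic above: the ELCR is a pair of I/O ports, 0x4d0 for IRQs 0-7 and 0x4d1 for IRQs 8-15, one bit per IRQ. So IRQ 10 maps to bit 2 of port 0x4d1, and a set bit means the line is level-triggered.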
492
493static void __init construct_default_ioirq_mptable(int mpc_default_type)
494{
495 struct mpc_config_intsrc intsrc;
496 int i;
497 int ELCR_fallback = 0;
498
499 intsrc.mpc_type = MP_INTSRC;
500 intsrc.mpc_irqflag = 0; /* conforming */
501 intsrc.mpc_srcbus = 0;
502 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
503
504 intsrc.mpc_irqtype = mp_INT;
505
506 /*
507 * If true, we have an ISA/PCI system with no IRQ entries
508 * in the MP table. To prevent the PCI interrupts from being set up
509 * incorrectly, we try to use the ELCR. The sanity check to see if
510 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
511 * never be level sensitive, so we simply see if the ELCR agrees.
512 * If it does, we assume it's valid.
513 */
514 if (mpc_default_type == 5) {
515 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
516
517 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
518 printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
519 else {
520 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
521 ELCR_fallback = 1;
522 }
523 }
524
525 for (i = 0; i < 16; i++) {
526 switch (mpc_default_type) {
527 case 2:
528 if (i == 0 || i == 13)
529 continue; /* IRQ0 & IRQ13 not connected */
530 /* fall through */
531 default:
532 if (i == 2)
533 continue; /* IRQ2 is never connected */
534 }
535
536 if (ELCR_fallback) {
537 /*
538 * If the ELCR indicates a level-sensitive interrupt, we
539 * copy that information over to the MP table in the
540 * irqflag field (level sensitive, active high polarity).
541 */
542 if (ELCR_trigger(i))
543 intsrc.mpc_irqflag = 13;
544 else
545 intsrc.mpc_irqflag = 0;
546 }
547
548 intsrc.mpc_srcbusirq = i;
549 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
550 MP_intsrc_info(&intsrc);
551 }
552
553 intsrc.mpc_irqtype = mp_ExtINT;
554 intsrc.mpc_srcbusirq = 0;
555 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
556 MP_intsrc_info(&intsrc);
557}
558
559static inline void __init construct_default_ISA_mptable(int mpc_default_type)
560{
561 struct mpc_config_processor processor;
562 struct mpc_config_bus bus;
563 struct mpc_config_ioapic ioapic;
564 struct mpc_config_lintsrc lintsrc;
565 int linttypes[2] = { mp_ExtINT, mp_NMI };
566 int i;
567
568 /*
569 * local APIC has default address
570 */
571 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
572
573 /*
574 * 2 CPUs, numbered 0 & 1.
575 */
576 processor.mpc_type = MP_PROCESSOR;
577 /* Either an integrated APIC or a discrete 82489DX. */
578 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
579 processor.mpc_cpuflag = CPU_ENABLED;
580 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
581 (boot_cpu_data.x86_model << 4) |
582 boot_cpu_data.x86_mask;
583 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
584 processor.mpc_reserved[0] = 0;
585 processor.mpc_reserved[1] = 0;
586 for (i = 0; i < 2; i++) {
587 processor.mpc_apicid = i;
588 MP_processor_info(&processor);
589 }
590
591 bus.mpc_type = MP_BUS;
592 bus.mpc_busid = 0;
593 switch (mpc_default_type) {
594 default:
595 printk("???\n");
596 printk(KERN_ERR "Unknown standard configuration %d\n",
597 mpc_default_type);
598 /* fall through */
599 case 1:
600 case 5:
601 memcpy(bus.mpc_bustype, "ISA ", 6);
602 break;
603 case 2:
604 case 6:
605 case 3:
606 memcpy(bus.mpc_bustype, "EISA ", 6);
607 break;
608 case 4:
609 case 7:
610 memcpy(bus.mpc_bustype, "MCA ", 6);
611 }
612 MP_bus_info(&bus);
613 if (mpc_default_type > 4) {
614 bus.mpc_busid = 1;
615 memcpy(bus.mpc_bustype, "PCI ", 6);
616 MP_bus_info(&bus);
617 }
618
619 ioapic.mpc_type = MP_IOAPIC;
620 ioapic.mpc_apicid = 2;
621 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
622 ioapic.mpc_flags = MPC_APIC_USABLE;
623 ioapic.mpc_apicaddr = 0xFEC00000;
624 MP_ioapic_info(&ioapic);
625
626 /*
627 * We set up most of the low 16 IO-APIC pins according to MPS rules.
628 */
629 construct_default_ioirq_mptable(mpc_default_type);
630
631 lintsrc.mpc_type = MP_LINTSRC;
632 lintsrc.mpc_irqflag = 0; /* conforming */
633 lintsrc.mpc_srcbusid = 0;
634 lintsrc.mpc_srcbusirq = 0;
635 lintsrc.mpc_destapic = MP_APIC_ALL;
636 for (i = 0; i < 2; i++) {
637 lintsrc.mpc_irqtype = linttypes[i];
638 lintsrc.mpc_destapiclint = i;
639 MP_lintsrc_info(&lintsrc);
640 }
641}
642
643static struct intel_mp_floating *mpf_found;
644
645/*
646 * Scan the memory blocks for an SMP configuration block.
647 */
648void __init get_smp_config (void)
649{
650 struct intel_mp_floating *mpf = mpf_found;
651
652 /*
653 * ACPI supports both logical (e.g. Hyper-Threading) and physical
654 * processors, where MPS only supports physical.
655 */
656 if (acpi_lapic && acpi_ioapic) {
657 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
658 return;
659 }
660 else if (acpi_lapic)
661 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
662
663 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
664 if (mpf->mpf_feature2 & (1<<7)) {
665 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
666 pic_mode = 1;
667 } else {
668 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
669 pic_mode = 0;
670 }
671
672 /*
673 * Now see if we need to read further.
674 */
675 if (mpf->mpf_feature1 != 0) {
676
677 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
678 construct_default_ISA_mptable(mpf->mpf_feature1);
679
680 } else if (mpf->mpf_physptr) {
681
682 /*
683 * Read the physical hardware table. Anything here will
684 * override the defaults.
685 */
686 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
687 smp_found_config = 0;
688 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
689 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
690 return;
691 }
692 /*
693 * If there are no explicit MP IRQ entries, then we are
694 * broken. We set up most of the low 16 IO-APIC pins to
695 * ISA defaults and hope it will work.
696 */
697 if (!mp_irq_entries) {
698 struct mpc_config_bus bus;
699
700 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
701
702 bus.mpc_type = MP_BUS;
703 bus.mpc_busid = 0;
704 memcpy(bus.mpc_bustype, "ISA ", 6);
705 MP_bus_info(&bus);
706
707 construct_default_ioirq_mptable(0);
708 }
709
710 } else
711 BUG();
712
713 printk(KERN_INFO "Processors: %d\n", num_processors);
714 /*
715 * Only use the first configuration found.
716 */
717}
718
719static int __init smp_scan_config (unsigned long base, unsigned long length)
720{
721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf;
723
724	Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
725 if (sizeof(*mpf) != 16)
726		printk(KERN_ERR "Error: MPF size\n");
727
728 while (length > 0) {
729 mpf = (struct intel_mp_floating *)bp;
730 if ((*bp == SMP_MAGIC_IDENT) &&
731 (mpf->mpf_length == 1) &&
732 !mpf_checksum((unsigned char *)bp, 16) &&
733 ((mpf->mpf_specification == 1)
734 || (mpf->mpf_specification == 4)) ) {
735
736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n",
738 virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) {
741 /*
742				 * We cannot access the MPC table to compute
743				 * its size yet, as only the first few
744				 * megabytes of memory are mapped at this point.
745				 * The PC-9800 places its MPC table at the very
746				 * end of physical memory, so blindly reserving
747				 * PAGE_SIZE from mpf->mpf_physptr would trigger
748				 * a BUG() in reserve_bootmem.
749 */
750 unsigned long size = PAGE_SIZE;
751 unsigned long end = max_low_pfn * PAGE_SIZE;
752 if (mpf->mpf_physptr + size > end)
753 size = end - mpf->mpf_physptr;
754 reserve_bootmem(mpf->mpf_physptr, size);
755 }
756
757 mpf_found = mpf;
758 return 1;
759 }
760 bp += 4;
761 length -= 16;
762 }
763 return 0;
764}
765
766void __init find_smp_config (void)
767{
768 unsigned int address;
769
770 /*
771 * FIXME: Linux assumes you have 640K of base ram..
772 * this continues the error...
773 *
774 * 1) Scan the bottom 1K for a signature
775 * 2) Scan the top 1K of base RAM
776 * 3) Scan the 64K of bios
777 */
778 if (smp_scan_config(0x0,0x400) ||
779 smp_scan_config(639*0x400,0x400) ||
780 smp_scan_config(0xF0000,0x10000))
781 return;
782 /*
783 * If it is an SMP machine we should know now, unless the
784 * configuration is in an EISA/MCA bus machine with an
785 * extended bios data area.
786 *
787	 * There is a real-mode segmented pointer pointing to the
788	 * 4K EBDA area at 0x40E; calculate and scan it here.
789 *
790 * NOTE! There are Linux loaders that will corrupt the EBDA
791 * area, and as such this kind of SMP config may be less
792 * trustworthy, simply because the SMP table may have been
793 * stomped on during early boot. These loaders are buggy and
794 * should be fixed.
795 *
796 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
797 */
798
799 address = get_bios_ebda();
800 if (address)
801 smp_scan_config(address, 0x400);
802}
803
804int es7000_plat;
805
806/* --------------------------------------------------------------------------
807 ACPI-based MP Configuration
808 -------------------------------------------------------------------------- */
809
810#ifdef CONFIG_ACPI
811
812void __init mp_register_lapic_address(u64 address)
813{
814 mp_lapic_addr = (unsigned long) address;
815
816 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
817
818 if (boot_cpu_physical_apicid == -1U)
819 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
820
821 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
822}
823
824void __cpuinit mp_register_lapic (u8 id, u8 enabled)
825{
826 struct mpc_config_processor processor;
827 int boot_cpu = 0;
828
829 if (MAX_APICS - id <= 0) {
830 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
831 id, MAX_APICS);
832 return;
833 }
834
835 if (id == boot_cpu_physical_apicid)
836 boot_cpu = 1;
837
838 processor.mpc_type = MP_PROCESSOR;
839 processor.mpc_apicid = id;
840 processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
841 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
842 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
843 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
844 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
845 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
846 processor.mpc_reserved[0] = 0;
847 processor.mpc_reserved[1] = 0;
848
849 MP_processor_info(&processor);
850}
851
852#ifdef CONFIG_X86_IO_APIC
853
854#define MP_ISA_BUS 0
855#define MP_MAX_IOAPIC_PIN 127
856
857static struct mp_ioapic_routing {
858 int apic_id;
859 int gsi_base;
860 int gsi_end;
861 u32 pin_programmed[4];
862} mp_ioapic_routing[MAX_IO_APICS];
863
864static int mp_find_ioapic (int gsi)
865{
866 int i = 0;
867
868 /* Find the IOAPIC that manages this GSI. */
869 for (i = 0; i < nr_ioapics; i++) {
870 if ((gsi >= mp_ioapic_routing[i].gsi_base)
871 && (gsi <= mp_ioapic_routing[i].gsi_end))
872 return i;
873 }
874
875 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
876
877 return -1;
878}
879
880void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
881{
882 int idx = 0;
883 int tmpid;
884
885 if (nr_ioapics >= MAX_IO_APICS) {
886 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
887 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
888 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
889 }
890 if (!address) {
891 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
892 " found in MADT table, skipping!\n");
893 return;
894 }
895
896 idx = nr_ioapics++;
897
898 mp_ioapics[idx].mpc_type = MP_IOAPIC;
899 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
900 mp_ioapics[idx].mpc_apicaddr = address;
901
902 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
903 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
904 && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
905 tmpid = io_apic_get_unique_id(idx, id);
906 else
907 tmpid = id;
908 if (tmpid == -1) {
909 nr_ioapics--;
910 return;
911 }
912 mp_ioapics[idx].mpc_apicid = tmpid;
913 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
914
915 /*
916 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
917 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx);
923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end);
929}
930
931void __init
932mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
933{
934 struct mpc_config_intsrc intsrc;
935 int ioapic = -1;
936 int pin = -1;
937
938 /*
939 * Convert 'gsi' to 'ioapic.pin'.
940 */
941 ioapic = mp_find_ioapic(gsi);
942 if (ioapic < 0)
943 return;
944 pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
945
946 /*
947 * TBD: This check is for faulty timer entries, where the override
948 * erroneously sets the trigger to level, resulting in a HUGE
949 * increase of timer interrupts!
950 */
951 if ((bus_irq == 0) && (trigger == 3))
952 trigger = 1;
953
954 intsrc.mpc_type = MP_INTSRC;
955 intsrc.mpc_irqtype = mp_INT;
956 intsrc.mpc_irqflag = (trigger << 2) | polarity;
957 intsrc.mpc_srcbus = MP_ISA_BUS;
958 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
959 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
960 intsrc.mpc_dstirq = pin; /* INTIN# */
961
962 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
963 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
964 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
965 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
966
967 mp_irqs[mp_irq_entries] = intsrc;
968 if (++mp_irq_entries == MAX_IRQ_SOURCES)
969 panic("Max # of irq sources exceeded!\n");
970}
971
972void __init mp_config_acpi_legacy_irqs (void)
973{
974 struct mpc_config_intsrc intsrc;
975 int i = 0;
976 int ioapic = -1;
977
978 /*
979 * Fabricate the legacy ISA bus (bus #31).
980 */
981 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
982 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
983
984 /*
985 * Older generations of ES7000 have no legacy identity mappings
986 */
987 if (es7000_plat == 1)
988 return;
989
990 /*
991 * Locate the IOAPIC that manages the ISA IRQs (0-15).
992 */
993 ioapic = mp_find_ioapic(0);
994 if (ioapic < 0)
995 return;
996
997 intsrc.mpc_type = MP_INTSRC;
998 intsrc.mpc_irqflag = 0; /* Conforming */
999 intsrc.mpc_srcbus = MP_ISA_BUS;
1000 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
1001
1002 /*
1003	 * Use the default configuration for IRQs 0-15, unless
1004	 * overridden by (MADT) interrupt source override entries.
1005 */
1006 for (i = 0; i < 16; i++) {
1007 int idx;
1008
1009 for (idx = 0; idx < mp_irq_entries; idx++) {
1010 struct mpc_config_intsrc *irq = mp_irqs + idx;
1011
1012 /* Do we already have a mapping for this ISA IRQ? */
1013 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
1014 break;
1015
1016 /* Do we already have a mapping for this IOAPIC pin */
1017 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
1018 (irq->mpc_dstirq == i))
1019 break;
1020 }
1021
1022 if (idx != mp_irq_entries) {
1023 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
1024 continue; /* IRQ already used */
1025 }
1026
1027 intsrc.mpc_irqtype = mp_INT;
1028 intsrc.mpc_srcbusirq = i; /* Identity mapped */
1029 intsrc.mpc_dstirq = i;
1030
1031 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
1032 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
1033 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
1034 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
1035 intsrc.mpc_dstirq);
1036
1037 mp_irqs[mp_irq_entries] = intsrc;
1038 if (++mp_irq_entries == MAX_IRQ_SOURCES)
1039 panic("Max # of irq sources exceeded!\n");
1040 }
1041}
1042
1043#define MAX_GSI_NUM 4096
1044
1045int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{
1047 int ioapic = -1;
1048 int ioapic_pin = 0;
1049 int idx, bit = 0;
1050 static int pci_irq = 16;
1051 /*
1052	 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices.
1055 */
1056 static int gsi_to_irq[MAX_GSI_NUM];
1057
1058 /* Don't set up the ACPI SCI because it's already set up */
1059 if (acpi_gbl_FADT.sci_interrupt == gsi)
1060 return gsi;
1061
1062 ioapic = mp_find_ioapic(gsi);
1063 if (ioapic < 0) {
1064 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
1065 return gsi;
1066 }
1067
1068 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
1069
1070 if (ioapic_renumber_irq)
1071 gsi = ioapic_renumber_irq(ioapic, gsi);
1072
1073 /*
1074 * Avoid pin reprogramming. PRTs typically include entries
1075 * with redundant pin->gsi mappings (but unique PCI devices);
1076 * we only program the IOAPIC on the first.
1077 */
1078 bit = ioapic_pin % 32;
1079 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
1080 if (idx > 3) {
1081 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1082 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
1083 ioapic_pin);
1084 return gsi;
1085 }
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi];
1090 }
1091
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093
1094 if (triggering == ACPI_LEVEL_SENSITIVE) {
1095 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins.
1098 */
1099 int irq = gsi;
1100 if (gsi < MAX_GSI_NUM) {
1101 /*
1102 * Retain the VIA chipset work-around (gsi > 15), but
1103 * avoid a problem where the 8254 timer (IRQ0) is setup
1104 * via an override (so it's not on pin 0 of the ioapic),
1105 * and at the same time, the pin 0 interrupt is a PCI
1106 * type. The gsi > 15 test could cause these two pins
1107 * to be shared as IRQ0, and they are not shareable.
1108 * So test for this condition, and if necessary, avoid
1109 * the pin collision.
1110 */
1111 if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
1112 gsi = pci_irq++;
1113 /*
1114 * Don't assign IRQ used by ACPI SCI
1115 */
1116 if (gsi == acpi_gbl_FADT.sci_interrupt)
1117 gsi = pci_irq++;
1118 gsi_to_irq[irq] = gsi;
1119 } else {
1120 printk(KERN_ERR "GSI %u is too high\n", gsi);
1121 return gsi;
1122 }
1123 }
1124
1125 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1126 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1127 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1128 return gsi;
1129}
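The pin_programmed bookkeeping used above packs one bit per I/O APIC pin into four 32-bit words. A hypothetical helper showing the same indexing (illustration only, not part of this file):

/* word index = pin / 32, bit = pin % 32, matching the idx/bit math in
 * mp_register_gsi(). */
static inline int ioapic_pin_seen(struct mp_ioapic_routing *rt, int pin)
{
	return (rt->pin_programmed[pin / 32] >> (pin % 32)) & 1;
}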
1130
1131#endif /* CONFIG_X86_IO_APIC */
1132#endif /* CONFIG_ACPI */
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
new file mode 100644
index 000000000000..8bf0ca03ac8e
--- /dev/null
+++ b/arch/x86/kernel/mpparse_64.c
@@ -0,0 +1,852 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/acpi.h>
23#include <linux/module.h>
24
25#include <asm/smp.h>
26#include <asm/mtrr.h>
27#include <asm/mpspec.h>
28#include <asm/pgalloc.h>
29#include <asm/io_apic.h>
30#include <asm/proto.h>
31#include <asm/acpi.h>
32
33/* Have we found an MP table */
34int smp_found_config;
35
36/*
37 * Various Linux-internal data structures created from the
38 * MP-table.
39 */
40DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
41int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
42
43static int mp_current_pci_id = 0;
44/* I/O APIC entries */
45struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
46
47/* # of MP IRQ source entries */
48struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
49
50/* MP IRQ source entries */
51int mp_irq_entries;
52
53int nr_ioapics;
54unsigned long mp_lapic_addr = 0;
55
56
57
58/* Processor that is doing the boot up */
59unsigned int boot_cpu_id = -1U;
60/* Internal processor count */
61unsigned int num_processors __cpuinitdata = 0;
62
63unsigned disabled_cpus __cpuinitdata;
64
65/* Bitmask of physically existing CPUs */
66physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
67
68u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
69
70
71/*
72 * Intel MP BIOS table parsing routines:
73 */
74
75/*
76 * Checksum an MP configuration block.
77 */
78
79static int __init mpf_checksum(unsigned char *mp, int len)
80{
81 int sum = 0;
82
83 while (len--)
84 sum += *mp++;
85
86 return sum & 0xFF;
87}
88
89static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
90{
91 int cpu;
92 cpumask_t tmp_map;
93 char *bootup_cpu = "";
94
95 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
96 disabled_cpus++;
97 return;
98 }
99 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
100 bootup_cpu = " (Bootup-CPU)";
101 boot_cpu_id = m->mpc_apicid;
102 }
103
104 printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
105
106 if (num_processors >= NR_CPUS) {
107 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
108 " Processor ignored.\n", NR_CPUS);
109 return;
110 }
111
112 num_processors++;
113 cpus_complement(tmp_map, cpu_present_map);
114 cpu = first_cpu(tmp_map);
115
116 physid_set(m->mpc_apicid, phys_cpu_present_map);
117 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
118 /*
119 * bios_cpu_apicid is required to have processors listed
120		 * in the same order as logical cpu numbers. Hence the first
121		 * entry is the BSP, and so on.
122 */
123 cpu = 0;
124 }
125 bios_cpu_apicid[cpu] = m->mpc_apicid;
126 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
127
128 cpu_set(cpu, cpu_possible_map);
129 cpu_set(cpu, cpu_present_map);
130}
131
132static void __init MP_bus_info (struct mpc_config_bus *m)
133{
134 char str[7];
135
136 memcpy(str, m->mpc_bustype, 6);
137 str[6] = 0;
138 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
139
140 if (strncmp(str, "ISA", 3) == 0) {
141 set_bit(m->mpc_busid, mp_bus_not_pci);
142 } else if (strncmp(str, "PCI", 3) == 0) {
143 clear_bit(m->mpc_busid, mp_bus_not_pci);
144 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
145 mp_current_pci_id++;
146 } else {
147 printk(KERN_ERR "Unknown bustype %s\n", str);
148 }
149}
150
151static int bad_ioapic(unsigned long address)
152{
153 if (nr_ioapics >= MAX_IO_APICS) {
154 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
155 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
156 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
157 }
158 if (!address) {
159 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
160 " found in table, skipping!\n");
161 return 1;
162 }
163 return 0;
164}
165
166static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
167{
168 if (!(m->mpc_flags & MPC_APIC_USABLE))
169 return;
170
171 printk("I/O APIC #%d at 0x%X.\n",
172 m->mpc_apicid, m->mpc_apicaddr);
173
174 if (bad_ioapic(m->mpc_apicaddr))
175 return;
176
177 mp_ioapics[nr_ioapics] = *m;
178 nr_ioapics++;
179}
180
181static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
182{
183 mp_irqs [mp_irq_entries] = *m;
184 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
185 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
186 m->mpc_irqtype, m->mpc_irqflag & 3,
187 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
188 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
189 if (++mp_irq_entries >= MAX_IRQ_SOURCES)
190 panic("Max # of irq sources exceeded!!\n");
191}
192
193static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
194{
195 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
196 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
197 m->mpc_irqtype, m->mpc_irqflag & 3,
198 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
199 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
200}
201
202/*
203 * Read/parse the MPC
204 */
205
206static int __init smp_read_mpc(struct mp_config_table *mpc)
207{
208 char str[16];
209 int count=sizeof(*mpc);
210 unsigned char *mpt=((unsigned char *)mpc)+count;
211
212 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
213 printk("MPTABLE: bad signature [%c%c%c%c]!\n",
214 mpc->mpc_signature[0],
215 mpc->mpc_signature[1],
216 mpc->mpc_signature[2],
217 mpc->mpc_signature[3]);
218 return 0;
219 }
220 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
221 printk("MPTABLE: checksum error!\n");
222 return 0;
223 }
224 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
225 printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
226 mpc->mpc_spec);
227 return 0;
228 }
229 if (!mpc->mpc_lapic) {
230 printk(KERN_ERR "MPTABLE: null local APIC address!\n");
231 return 0;
232 }
233 memcpy(str,mpc->mpc_oem,8);
234 str[8] = 0;
235 printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
236
237 memcpy(str,mpc->mpc_productid,12);
238 str[12] = 0;
239 printk("MPTABLE: Product ID: %s ",str);
240
241 printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
242
243 /* save the local APIC address, it might be non-default */
244 if (!acpi_lapic)
245 mp_lapic_addr = mpc->mpc_lapic;
246
247 /*
248 * Now process the configuration blocks.
249 */
250 while (count < mpc->mpc_length) {
251 switch(*mpt) {
252 case MP_PROCESSOR:
253 {
254 struct mpc_config_processor *m=
255 (struct mpc_config_processor *)mpt;
256 if (!acpi_lapic)
257 MP_processor_info(m);
258 mpt += sizeof(*m);
259 count += sizeof(*m);
260 break;
261 }
262 case MP_BUS:
263 {
264 struct mpc_config_bus *m=
265 (struct mpc_config_bus *)mpt;
266 MP_bus_info(m);
267 mpt += sizeof(*m);
268 count += sizeof(*m);
269 break;
270 }
271 case MP_IOAPIC:
272 {
273 struct mpc_config_ioapic *m=
274 (struct mpc_config_ioapic *)mpt;
275 MP_ioapic_info(m);
276 mpt += sizeof(*m);
277 count += sizeof(*m);
278 break;
279 }
280 case MP_INTSRC:
281 {
282 struct mpc_config_intsrc *m=
283 (struct mpc_config_intsrc *)mpt;
284
285 MP_intsrc_info(m);
286 mpt += sizeof(*m);
287 count += sizeof(*m);
288 break;
289 }
290 case MP_LINTSRC:
291 {
292 struct mpc_config_lintsrc *m=
293 (struct mpc_config_lintsrc *)mpt;
294 MP_lintsrc_info(m);
295 mpt += sizeof(*m);
296 count += sizeof(*m);
297 break;
298 }
299 }
300 }
301 setup_apic_routing();
302 if (!num_processors)
303 printk(KERN_ERR "MPTABLE: no processors registered!\n");
304 return num_processors;
305}
306
307static int __init ELCR_trigger(unsigned int irq)
308{
309 unsigned int port;
310
311 port = 0x4d0 + (irq >> 3);
312 return (inb(port) >> (irq & 7)) & 1;
313}
314
315static void __init construct_default_ioirq_mptable(int mpc_default_type)
316{
317 struct mpc_config_intsrc intsrc;
318 int i;
319 int ELCR_fallback = 0;
320
321 intsrc.mpc_type = MP_INTSRC;
322 intsrc.mpc_irqflag = 0; /* conforming */
323 intsrc.mpc_srcbus = 0;
324 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
325
326 intsrc.mpc_irqtype = mp_INT;
327
328 /*
329 * If true, we have an ISA/PCI system with no IRQ entries
330 * in the MP table. To prevent the PCI interrupts from being set up
331 * incorrectly, we try to use the ELCR. The sanity check to see if
332 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
333 * never be level sensitive, so we simply see if the ELCR agrees.
334 * If it does, we assume it's valid.
335 */
336 if (mpc_default_type == 5) {
337 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
338
339 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
340 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
341 else {
342 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
343 ELCR_fallback = 1;
344 }
345 }
346
347 for (i = 0; i < 16; i++) {
348 switch (mpc_default_type) {
349 case 2:
350 if (i == 0 || i == 13)
351 continue; /* IRQ0 & IRQ13 not connected */
352 /* fall through */
353 default:
354 if (i == 2)
355 continue; /* IRQ2 is never connected */
356 }
357
358 if (ELCR_fallback) {
359 /*
360 * If the ELCR indicates a level-sensitive interrupt, we
361 * copy that information over to the MP table in the
362 * irqflag field (level sensitive, active high polarity).
363 */
364 if (ELCR_trigger(i))
365 intsrc.mpc_irqflag = 13;
366 else
367 intsrc.mpc_irqflag = 0;
368 }
369
370 intsrc.mpc_srcbusirq = i;
371 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
372 MP_intsrc_info(&intsrc);
373 }
374
375 intsrc.mpc_irqtype = mp_ExtINT;
376 intsrc.mpc_srcbusirq = 0;
377 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
378 MP_intsrc_info(&intsrc);
379}
380
381static inline void __init construct_default_ISA_mptable(int mpc_default_type)
382{
383 struct mpc_config_processor processor;
384 struct mpc_config_bus bus;
385 struct mpc_config_ioapic ioapic;
386 struct mpc_config_lintsrc lintsrc;
387 int linttypes[2] = { mp_ExtINT, mp_NMI };
388 int i;
389
390 /*
391 * local APIC has default address
392 */
393 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
394
395 /*
396 * 2 CPUs, numbered 0 & 1.
397 */
398 processor.mpc_type = MP_PROCESSOR;
399 processor.mpc_apicver = 0;
400 processor.mpc_cpuflag = CPU_ENABLED;
401 processor.mpc_cpufeature = 0;
402 processor.mpc_featureflag = 0;
403 processor.mpc_reserved[0] = 0;
404 processor.mpc_reserved[1] = 0;
405 for (i = 0; i < 2; i++) {
406 processor.mpc_apicid = i;
407 MP_processor_info(&processor);
408 }
409
410 bus.mpc_type = MP_BUS;
411 bus.mpc_busid = 0;
412 switch (mpc_default_type) {
413 default:
414 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
415 mpc_default_type);
416 /* fall through */
417 case 1:
418 case 5:
419 memcpy(bus.mpc_bustype, "ISA ", 6);
420 break;
421 }
422 MP_bus_info(&bus);
423 if (mpc_default_type > 4) {
424 bus.mpc_busid = 1;
425 memcpy(bus.mpc_bustype, "PCI ", 6);
426 MP_bus_info(&bus);
427 }
428
429 ioapic.mpc_type = MP_IOAPIC;
430 ioapic.mpc_apicid = 2;
431 ioapic.mpc_apicver = 0;
432 ioapic.mpc_flags = MPC_APIC_USABLE;
433 ioapic.mpc_apicaddr = 0xFEC00000;
434 MP_ioapic_info(&ioapic);
435
436 /*
437 * We set up most of the low 16 IO-APIC pins according to MPS rules.
438 */
439 construct_default_ioirq_mptable(mpc_default_type);
440
441 lintsrc.mpc_type = MP_LINTSRC;
442 lintsrc.mpc_irqflag = 0; /* conforming */
443 lintsrc.mpc_srcbusid = 0;
444 lintsrc.mpc_srcbusirq = 0;
445 lintsrc.mpc_destapic = MP_APIC_ALL;
446 for (i = 0; i < 2; i++) {
447 lintsrc.mpc_irqtype = linttypes[i];
448 lintsrc.mpc_destapiclint = i;
449 MP_lintsrc_info(&lintsrc);
450 }
451}
452
453static struct intel_mp_floating *mpf_found;
454
455/*
456 * Scan the memory blocks for an SMP configuration block.
457 */
458void __init get_smp_config (void)
459{
460 struct intel_mp_floating *mpf = mpf_found;
461
462 /*
463 * ACPI supports both logical (e.g. Hyper-Threading) and physical
464 * processors, where MPS only supports physical.
465 */
466 if (acpi_lapic && acpi_ioapic) {
467 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
468 return;
469 }
470 else if (acpi_lapic)
471 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
472
473 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
474
475 /*
476 * Now see if we need to read further.
477 */
478 if (mpf->mpf_feature1 != 0) {
479
480 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
481 construct_default_ISA_mptable(mpf->mpf_feature1);
482
483 } else if (mpf->mpf_physptr) {
484
485 /*
486 * Read the physical hardware table. Anything here will
487 * override the defaults.
488 */
489 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
490 smp_found_config = 0;
491 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
492 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
493 return;
494 }
495 /*
496 * If there are no explicit MP IRQ entries, then we are
497 * broken. We set up most of the low 16 IO-APIC pins to
498 * ISA defaults and hope it will work.
499 */
500 if (!mp_irq_entries) {
501 struct mpc_config_bus bus;
502
503 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
504
505 bus.mpc_type = MP_BUS;
506 bus.mpc_busid = 0;
507 memcpy(bus.mpc_bustype, "ISA ", 6);
508 MP_bus_info(&bus);
509
510 construct_default_ioirq_mptable(0);
511 }
512
513 } else
514 BUG();
515
516 printk(KERN_INFO "Processors: %d\n", num_processors);
517 /*
518 * Only use the first configuration found.
519 */
520}
521
522static int __init smp_scan_config (unsigned long base, unsigned long length)
523{
524 extern void __bad_mpf_size(void);
525 unsigned int *bp = phys_to_virt(base);
526 struct intel_mp_floating *mpf;
527
528 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
529 if (sizeof(*mpf) != 16)
530 __bad_mpf_size();
531
532 while (length > 0) {
533 mpf = (struct intel_mp_floating *)bp;
534 if ((*bp == SMP_MAGIC_IDENT) &&
535 (mpf->mpf_length == 1) &&
536 !mpf_checksum((unsigned char *)bp, 16) &&
537 ((mpf->mpf_specification == 1)
538 || (mpf->mpf_specification == 4)) ) {
539
540 smp_found_config = 1;
541 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
542 if (mpf->mpf_physptr)
543 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
544 mpf_found = mpf;
545 return 1;
546 }
547 bp += 4;
548 length -= 16;
549 }
550 return 0;
551}
552
553void __init find_smp_config(void)
554{
555 unsigned int address;
556
557 /*
558 * FIXME: Linux assumes you have 640K of base ram..
559 * this continues the error...
560 *
561 * 1) Scan the bottom 1K for a signature
562 * 2) Scan the top 1K of base RAM
563 * 3) Scan the 64K of bios
564 */
565 if (smp_scan_config(0x0,0x400) ||
566 smp_scan_config(639*0x400,0x400) ||
567 smp_scan_config(0xF0000,0x10000))
568 return;
569 /*
570 * If it is an SMP machine we should know now.
571 *
572 * there is a real-mode segmented pointer pointing to the
573 * 4K EBDA area at 0x40E, calculate and scan it here.
574 *
575 * NOTE! There are Linux loaders that will corrupt the EBDA
576 * area, and as such this kind of SMP config may be less
577 * trustworthy, simply because the SMP table may have been
578 * stomped on during early boot. These loaders are buggy and
579 * should be fixed.
580 */
581
582 address = *(unsigned short *)phys_to_virt(0x40E);
583 address <<= 4;
584 if (smp_scan_config(address, 0x1000))
585 return;
586
587 /* If we have come this far, we did not find an MP table */
588 printk(KERN_INFO "No mptable found.\n");
589}
590
591/* --------------------------------------------------------------------------
592 ACPI-based MP Configuration
593 -------------------------------------------------------------------------- */
594
595#ifdef CONFIG_ACPI
596
597void __init mp_register_lapic_address(u64 address)
598{
599 mp_lapic_addr = (unsigned long) address;
600 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
601 if (boot_cpu_id == -1U)
602 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
603}
604
605void __cpuinit mp_register_lapic (u8 id, u8 enabled)
606{
607 struct mpc_config_processor processor;
608 int boot_cpu = 0;
609
610 if (id == boot_cpu_id)
611 boot_cpu = 1;
612
613 processor.mpc_type = MP_PROCESSOR;
614 processor.mpc_apicid = id;
615 processor.mpc_apicver = 0;
616 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
617 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
618 processor.mpc_cpufeature = 0;
619 processor.mpc_featureflag = 0;
620 processor.mpc_reserved[0] = 0;
621 processor.mpc_reserved[1] = 0;
622
623 MP_processor_info(&processor);
624}
625
626#define MP_ISA_BUS 0
627#define MP_MAX_IOAPIC_PIN 127
628
629static struct mp_ioapic_routing {
630 int apic_id;
631 int gsi_start;
632 int gsi_end;
633 u32 pin_programmed[4];
634} mp_ioapic_routing[MAX_IO_APICS];
635
636static int mp_find_ioapic(int gsi)
637{
638 int i = 0;
639
640 /* Find the IOAPIC that manages this GSI. */
641 for (i = 0; i < nr_ioapics; i++) {
642 if ((gsi >= mp_ioapic_routing[i].gsi_start)
643 && (gsi <= mp_ioapic_routing[i].gsi_end))
644 return i;
645 }
646
647 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
648 return -1;
649}
650
651static u8 uniq_ioapic_id(u8 id)
652{
653 int i;
654 DECLARE_BITMAP(used, 256);
655 bitmap_zero(used, 256);
656 for (i = 0; i < nr_ioapics; i++) {
657 struct mpc_config_ioapic *ia = &mp_ioapics[i];
658 __set_bit(ia->mpc_apicid, used);
659 }
660 if (!test_bit(id, used))
661 return id;
662 return find_first_zero_bit(used, 256);
663}
664
665void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
666{
667 int idx = 0;
668
669 if (bad_ioapic(address))
670 return;
671
672 idx = nr_ioapics;
673
674 mp_ioapics[idx].mpc_type = MP_IOAPIC;
675 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
676 mp_ioapics[idx].mpc_apicaddr = address;
677
678 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
679 mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
680 mp_ioapics[idx].mpc_apicver = 0;
681
682 /*
683 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
684 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
685 */
686 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
687 mp_ioapic_routing[idx].gsi_start = gsi_base;
688 mp_ioapic_routing[idx].gsi_end = gsi_base +
689 io_apic_get_redir_entries(idx);
690
691 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
692 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
693 mp_ioapics[idx].mpc_apicaddr,
694 mp_ioapic_routing[idx].gsi_start,
695 mp_ioapic_routing[idx].gsi_end);
696
697 nr_ioapics++;
698}
699
700void __init
701mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
702{
703 struct mpc_config_intsrc intsrc;
704 int ioapic = -1;
705 int pin = -1;
706
707 /*
708 * Convert 'gsi' to 'ioapic.pin'.
709 */
710 ioapic = mp_find_ioapic(gsi);
711 if (ioapic < 0)
712 return;
713 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
714
715 /*
716 * TBD: This check is for faulty timer entries, where the override
717 * erroneously sets the trigger to level, resulting in a HUGE
718 * increase of timer interrupts!
719 */
720 if ((bus_irq == 0) && (trigger == 3))
721 trigger = 1;
722
723 intsrc.mpc_type = MP_INTSRC;
724 intsrc.mpc_irqtype = mp_INT;
725 intsrc.mpc_irqflag = (trigger << 2) | polarity;
726 intsrc.mpc_srcbus = MP_ISA_BUS;
727 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
728 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
729 intsrc.mpc_dstirq = pin; /* INTIN# */
730
731 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
732 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
733 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
734 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
735
736 mp_irqs[mp_irq_entries] = intsrc;
737 if (++mp_irq_entries == MAX_IRQ_SOURCES)
738 panic("Max # of irq sources exceeded!\n");
739}
740
741void __init mp_config_acpi_legacy_irqs(void)
742{
743 struct mpc_config_intsrc intsrc;
744 int i = 0;
745 int ioapic = -1;
746
747 /*
748	 * Fabricate the legacy ISA bus (bus MP_ISA_BUS).
749 */
750 set_bit(MP_ISA_BUS, mp_bus_not_pci);
751
752 /*
753 * Locate the IOAPIC that manages the ISA IRQs (0-15).
754 */
755 ioapic = mp_find_ioapic(0);
756 if (ioapic < 0)
757 return;
758
759 intsrc.mpc_type = MP_INTSRC;
760 intsrc.mpc_irqflag = 0; /* Conforming */
761 intsrc.mpc_srcbus = MP_ISA_BUS;
762 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
763
764 /*
765	 * Use the default configuration for IRQs 0-15, unless
766	 * overridden by (MADT) interrupt source override entries.
767 */
768 for (i = 0; i < 16; i++) {
769 int idx;
770
771 for (idx = 0; idx < mp_irq_entries; idx++) {
772 struct mpc_config_intsrc *irq = mp_irqs + idx;
773
774 /* Do we already have a mapping for this ISA IRQ? */
775 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
776 break;
777
778 /* Do we already have a mapping for this IOAPIC pin */
779 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
780 (irq->mpc_dstirq == i))
781 break;
782 }
783
784 if (idx != mp_irq_entries) {
785 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
786 continue; /* IRQ already used */
787 }
788
789 intsrc.mpc_irqtype = mp_INT;
790 intsrc.mpc_srcbusirq = i; /* Identity mapped */
791 intsrc.mpc_dstirq = i;
792
793 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
794 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
795 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
796 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
797 intsrc.mpc_dstirq);
798
799 mp_irqs[mp_irq_entries] = intsrc;
800 if (++mp_irq_entries == MAX_IRQ_SOURCES)
801 panic("Max # of irq sources exceeded!\n");
802 }
803}
804
805int mp_register_gsi(u32 gsi, int triggering, int polarity)
806{
807 int ioapic = -1;
808 int ioapic_pin = 0;
809 int idx, bit = 0;
810
811 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
812 return gsi;
813
814 /* Don't set up the ACPI SCI because it's already set up */
815 if (acpi_gbl_FADT.sci_interrupt == gsi)
816 return gsi;
817
818 ioapic = mp_find_ioapic(gsi);
819 if (ioapic < 0) {
820 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
821 return gsi;
822 }
823
824 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
825
826 /*
827 * Avoid pin reprogramming. PRTs typically include entries
828 * with redundant pin->gsi mappings (but unique PCI devices);
829 * we only program the IOAPIC on the first.
830 */
831 bit = ioapic_pin % 32;
832 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
833 if (idx > 3) {
834 printk(KERN_ERR "Invalid reference to IOAPIC pin "
835 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
836 ioapic_pin);
837 return gsi;
838 }
839 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
840 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
841 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
842 return gsi;
843 }
844
845 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
846
847 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
848 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
849 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
850 return gsi;
851}
852#endif /*CONFIG_ACPI*/
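mp_register_gsi() above avoids reprogramming an I/O APIC pin by keeping one bit per pin in pin_programmed[4]: word = pin / 32, bit = pin % 32, covering pins 0-127. A minimal standalone sketch of just that bookkeeping, with names borrowed from the routing structure above:

#include <stdio.h>
#include <stdint.h>

static uint32_t pin_programmed[4];	/* 4 x 32 bits: pins 0..127 */

/* 0: pin claimed now, 1: already programmed, -1: out of range. */
static int claim_pin(int pin)
{
	int idx = pin / 32;		/* which 32-bit word */
	int bit = pin % 32;		/* which bit inside that word */

	if (idx > 3)			/* beyond pin 127 */
		return -1;
	if (pin_programmed[idx] & (1u << bit))
		return 1;		/* redundant PRT entry: skip programming */
	pin_programmed[idx] |= 1u << bit;
	return 0;
}

int main(void)
{
	printf("%d\n", claim_pin(19));	/* 0: first reference, program the pin */
	printf("%d\n", claim_pin(19));	/* 1: duplicate mapping, left alone */
	return 0;
}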
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 000000000000..0c1069b8d638
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,224 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
25
26#include <linux/module.h>
27
28#include <linux/types.h>
29#include <linux/errno.h>
30#include <linux/fcntl.h>
31#include <linux/init.h>
32#include <linux/poll.h>
33#include <linux/smp.h>
34#include <linux/smp_lock.h>
35#include <linux/major.h>
36#include <linux/fs.h>
37#include <linux/device.h>
38#include <linux/cpu.h>
39#include <linux/notifier.h>
40
41#include <asm/processor.h>
42#include <asm/msr.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static struct class *msr_class;
47
48static loff_t msr_seek(struct file *file, loff_t offset, int orig)
49{
50 loff_t ret = -EINVAL;
51
52 lock_kernel();
53 switch (orig) {
54 case 0:
55 file->f_pos = offset;
56 ret = file->f_pos;
57 break;
58 case 1:
59 file->f_pos += offset;
60 ret = file->f_pos;
61 }
62 unlock_kernel();
63 return ret;
64}
65
66static ssize_t msr_read(struct file *file, char __user * buf,
67 size_t count, loff_t * ppos)
68{
69 u32 __user *tmp = (u32 __user *) buf;
70 u32 data[2];
71 u32 reg = *ppos;
72 int cpu = iminor(file->f_path.dentry->d_inode);
73 int err;
74
75 if (count % 8)
76 return -EINVAL; /* Invalid chunk size */
77
78 for (; count; count -= 8) {
79 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
80 if (err)
81 return -EIO;
82 if (copy_to_user(tmp, &data, 8))
83 return -EFAULT;
84 tmp += 2;
85 }
86
87 return ((char __user *)tmp) - buf;
88}
89
90static ssize_t msr_write(struct file *file, const char __user *buf,
91 size_t count, loff_t *ppos)
92{
93 const u32 __user *tmp = (const u32 __user *)buf;
94 u32 data[2];
95 u32 reg = *ppos;
96 int cpu = iminor(file->f_path.dentry->d_inode);
97 int err;
98
99 if (count % 8)
100 return -EINVAL; /* Invalid chunk size */
101
102 for (; count; count -= 8) {
103 if (copy_from_user(&data, tmp, 8))
104 return -EFAULT;
105 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
106 if (err)
107 return -EIO;
108 tmp += 2;
109 }
110
111 return ((char __user *)tmp) - buf;
112}
113
114static int msr_open(struct inode *inode, struct file *file)
115{
116 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
117 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
118
119 if (cpu >= NR_CPUS || !cpu_online(cpu))
120 return -ENXIO; /* No such CPU */
121 if (!cpu_has(c, X86_FEATURE_MSR))
122 return -EIO; /* MSR not supported */
123
124 return 0;
125}
126
127/*
128 * File operations we support
129 */
130static const struct file_operations msr_fops = {
131 .owner = THIS_MODULE,
132 .llseek = msr_seek,
133 .read = msr_read,
134 .write = msr_write,
135 .open = msr_open,
136};
137
138static int msr_device_create(int i)
139{
140 int err = 0;
141 struct device *dev;
142
143 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
144 if (IS_ERR(dev))
145 err = PTR_ERR(dev);
146 return err;
147}
148
149static int msr_class_cpu_callback(struct notifier_block *nfb,
150 unsigned long action, void *hcpu)
151{
152 unsigned int cpu = (unsigned long)hcpu;
153
154 switch (action) {
155 case CPU_ONLINE:
156 case CPU_ONLINE_FROZEN:
157 msr_device_create(cpu);
158 break;
159 case CPU_DEAD:
160 case CPU_DEAD_FROZEN:
161 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
162 break;
163 }
164 return NOTIFY_OK;
165}
166
167static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
168{
169 .notifier_call = msr_class_cpu_callback,
170};
171
172static int __init msr_init(void)
173{
174 int i, err = 0;
175 i = 0;
176
177 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
178 printk(KERN_ERR "msr: unable to get major %d for msr\n",
179 MSR_MAJOR);
180 err = -EBUSY;
181 goto out;
182 }
183 msr_class = class_create(THIS_MODULE, "msr");
184 if (IS_ERR(msr_class)) {
185 err = PTR_ERR(msr_class);
186 goto out_chrdev;
187 }
188 for_each_online_cpu(i) {
189 err = msr_device_create(i);
190 if (err != 0)
191 goto out_class;
192 }
193 register_hotcpu_notifier(&msr_class_cpu_notifier);
194
195 err = 0;
196 goto out;
197
198out_class:
199 i = 0;
200 for_each_online_cpu(i)
201 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
202 class_destroy(msr_class);
203out_chrdev:
204 unregister_chrdev(MSR_MAJOR, "cpu/msr");
205out:
206 return err;
207}
208
209static void __exit msr_exit(void)
210{
211 int cpu = 0;
212 for_each_online_cpu(cpu)
213 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
214 class_destroy(msr_class);
215 unregister_chrdev(MSR_MAJOR, "cpu/msr");
216 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
217}
218
219module_init(msr_init);
220module_exit(msr_exit)
221
222MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
223MODULE_DESCRIPTION("x86 generic MSR driver");
224MODULE_LICENSE("GPL");
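As the header comment says, the device is driven by seeking to the MSR index and then reading or writing multiples of 8 bytes, with the minor number selecting the CPU. A rough user-space illustration of that access pattern follows; it assumes the /dev/cpu/0/msr node exists and root privileges, and MSR 0x10 (the time-stamp counter) is used purely as an example index.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* minor 0 -> CPU 0 */

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* seek to the MSR index, then read exactly 8 bytes */
	if (lseek(fd, 0x10, SEEK_SET) == (off_t)-1 ||
	    read(fd, &val, sizeof(val)) != sizeof(val)) {
		perror("read MSR 0x10");
		close(fd);
		return 1;
	}
	printf("MSR 0x10 = %#llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}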
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
new file mode 100644
index 000000000000..c7227e2180f8
--- /dev/null
+++ b/arch/x86/kernel/nmi_32.c
@@ -0,0 +1,468 @@
1/*
2 * linux/arch/i386/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
12 * Pavel Machek and
13 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
14 */
15
16#include <linux/delay.h>
17#include <linux/interrupt.h>
18#include <linux/module.h>
19#include <linux/nmi.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/percpu.h>
23#include <linux/kprobes.h>
24#include <linux/cpumask.h>
25#include <linux/kernel_stat.h>
26#include <linux/kdebug.h>
27
28#include <asm/smp.h>
29#include <asm/nmi.h>
30
31#include "mach_traps.h"
32
33int unknown_nmi_panic;
34int nmi_watchdog_enabled;
35
36static cpumask_t backtrace_mask = CPU_MASK_NONE;
37
38/* nmi_active:
39 * >0: the lapic NMI watchdog is active, but can be disabled
40 * <0: the lapic NMI watchdog has not been set up, and cannot
41 * be enabled
42 * 0: the lapic NMI watchdog is disabled, but can be enabled
43 */
44atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54static int endflag __initdata = 0;
55
56#ifdef CONFIG_SMP
57/* The performance counters used by NMI_LOCAL_APIC don't trigger when
58 * the CPU is idle. To make sure the NMI watchdog really ticks on all
59 * CPUs during the test make them busy.
60 */
61static __init void nmi_cpu_busy(void *data)
62{
63 local_irq_enable_in_hardirq();
64 /* Intentionally don't use cpu_relax here. This is
65 to make sure that the performance counter really ticks,
66 even if there is a simulator or similar that catches the
67 pause instruction. On a real HT machine this is fine because
68 all other CPUs are busy with "useless" delay loops and don't
69	   care if they get somewhat fewer cycles. */
70 while (endflag == 0)
71 mb();
72}
73#endif
74
75static int __init check_nmi_watchdog(void)
76{
77 unsigned int *prev_nmi_count;
78 int cpu;
79
80 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
81 return 0;
82
83 if (!atomic_read(&nmi_active))
84 return 0;
85
86 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
87 if (!prev_nmi_count)
88 return -1;
89
90 printk(KERN_INFO "Testing NMI watchdog ... ");
91
92 if (nmi_watchdog == NMI_LOCAL_APIC)
93 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
94
95 for_each_possible_cpu(cpu)
96 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
97 local_irq_enable();
98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
99
100 for_each_possible_cpu(cpu) {
101#ifdef CONFIG_SMP
102 /* Check cpu_callin_map here because that is set
103 after the timer is started. */
104 if (!cpu_isset(cpu, cpu_callin_map))
105 continue;
106#endif
107 if (!per_cpu(wd_enabled, cpu))
108 continue;
109 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
110 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
111 cpu,
112 prev_nmi_count[cpu],
113 nmi_count(cpu));
114 per_cpu(wd_enabled, cpu) = 0;
115 atomic_dec(&nmi_active);
116 }
117 }
118 endflag = 1;
119 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1);
122 return -1;
123 }
124 printk("OK.\n");
125
126 /* now that we know it works we can reduce NMI frequency to
127 something more reasonable; makes a difference in some configs */
128 if (nmi_watchdog == NMI_LOCAL_APIC)
129 nmi_hz = lapic_adjust_nmi_hz(1);
130
131 kfree(prev_nmi_count);
132 return 0;
133}
134/* This needs to happen later in boot so counters are working */
135late_initcall(check_nmi_watchdog);
136
137static int __init setup_nmi_watchdog(char *str)
138{
139 int nmi;
140
141 get_option(&str, &nmi);
142
143 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
144 return 0;
145
146 nmi_watchdog = nmi;
147 return 1;
148}
149
150__setup("nmi_watchdog=", setup_nmi_watchdog);
151
152
153/* Suspend/resume support */
154
155#ifdef CONFIG_PM
156
157static int nmi_pm_active; /* nmi_active before suspend */
158
159static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
160{
161 /* only CPU0 goes here, other CPUs should be offline */
162 nmi_pm_active = atomic_read(&nmi_active);
163 stop_apic_nmi_watchdog(NULL);
164 BUG_ON(atomic_read(&nmi_active) != 0);
165 return 0;
166}
167
168static int lapic_nmi_resume(struct sys_device *dev)
169{
170 /* only CPU0 goes here, other CPUs should be offline */
171 if (nmi_pm_active > 0) {
172 setup_apic_nmi_watchdog(NULL);
173 touch_nmi_watchdog();
174 }
175 return 0;
176}
177
178
179static struct sysdev_class nmi_sysclass = {
180 set_kset_name("lapic_nmi"),
181 .resume = lapic_nmi_resume,
182 .suspend = lapic_nmi_suspend,
183};
184
185static struct sys_device device_lapic_nmi = {
186 .id = 0,
187 .cls = &nmi_sysclass,
188};
189
190static int __init init_lapic_nmi_sysfs(void)
191{
192 int error;
193
194 /* should really be a BUG_ON but b/c this is an
195 * init call, it just doesn't work. -dcz
196 */
197 if (nmi_watchdog != NMI_LOCAL_APIC)
198 return 0;
199
200 if (atomic_read(&nmi_active) < 0)
201 return 0;
202
203 error = sysdev_class_register(&nmi_sysclass);
204 if (!error)
205 error = sysdev_register(&device_lapic_nmi);
206 return error;
207}
208/* must come after the local APIC's device_initcall() */
209late_initcall(init_lapic_nmi_sysfs);
210
211#endif /* CONFIG_PM */
212
213static void __acpi_nmi_enable(void *__unused)
214{
215 apic_write_around(APIC_LVT0, APIC_DM_NMI);
216}
217
218/*
219 * Enable timer based NMIs on all CPUs:
220 */
221void acpi_nmi_enable(void)
222{
223 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
224 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
225}
226
227static void __acpi_nmi_disable(void *__unused)
228{
229 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
230}
231
232/*
233 * Disable timer based NMIs on all CPUs:
234 */
235void acpi_nmi_disable(void)
236{
237 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
238 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
239}
240
241void setup_apic_nmi_watchdog (void *unused)
242{
243 if (__get_cpu_var(wd_enabled))
244 return;
245
246 /* cheap hack to support suspend/resume */
247 /* if cpu0 is not active neither should the other cpus */
248 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
249 return;
250
251 switch (nmi_watchdog) {
252 case NMI_LOCAL_APIC:
253		__get_cpu_var(wd_enabled) = 1; /* enable it first to avoid a race with the handler */
254 if (lapic_watchdog_init(nmi_hz) < 0) {
255 __get_cpu_var(wd_enabled) = 0;
256 return;
257 }
258 /* FALL THROUGH */
259 case NMI_IO_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 atomic_inc(&nmi_active);
262 }
263}
264
265void stop_apic_nmi_watchdog(void *unused)
266{
267 /* only support LOCAL and IO APICs for now */
268 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
269 (nmi_watchdog != NMI_IO_APIC))
270 return;
271 if (__get_cpu_var(wd_enabled) == 0)
272 return;
273 if (nmi_watchdog == NMI_LOCAL_APIC)
274 lapic_watchdog_stop();
275 __get_cpu_var(wd_enabled) = 0;
276 atomic_dec(&nmi_active);
277}
278
279/*
280 * the best way to detect whether a CPU has a 'hard lockup' problem
281 * is to check its local APIC timer IRQ counts. If they are not
282 * changing then that CPU has some problem.
283 *
284 * as these watchdog NMI IRQs are generated on every CPU, we only
285 * have to check the current processor.
286 *
287 * since NMIs don't listen to _any_ locks, we have to be extremely
288 * careful not to rely on unsafe variables. The printk might lock
289 * up though, so we have to break up any console locks first ...
290 * [when there will be more tty-related locks, break them up
291 * here too!]
292 */
293
294static unsigned int
295 last_irq_sums [NR_CPUS],
296 alert_counter [NR_CPUS];
297
298void touch_nmi_watchdog(void)
299{
300 if (nmi_watchdog > 0) {
301 unsigned cpu;
302
303 /*
304 * Just reset the alert counters, (other CPUs might be
305 * spinning on locks we hold):
306 */
307 for_each_present_cpu(cpu) {
308 if (alert_counter[cpu])
309 alert_counter[cpu] = 0;
310 }
311 }
312
313 /*
314 * Tickle the softlockup detector too:
315 */
316 touch_softlockup_watchdog();
317}
318EXPORT_SYMBOL(touch_nmi_watchdog);
319
320extern void die_nmi(struct pt_regs *, const char *msg);
321
322__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
323{
324
325 /*
326 * Since current_thread_info()-> is always on the stack, and we
327 * always switch the stack NMI-atomically, it's safe to use
328 * smp_processor_id().
329 */
330 unsigned int sum;
331 int touched = 0;
332 int cpu = smp_processor_id();
333 int rc=0;
334
335 /* check for other users first */
336 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
337 == NOTIFY_STOP) {
338 rc = 1;
339 touched = 1;
340 }
341
342 if (cpu_isset(cpu, backtrace_mask)) {
343 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
344
345 spin_lock(&lock);
346 printk("NMI backtrace for cpu %d\n", cpu);
347 dump_stack();
348 spin_unlock(&lock);
349 cpu_clear(cpu, backtrace_mask);
350 }
351
352 /*
353 * Take the local apic timer and PIT/HPET into account. We don't
354 * know which one is active, when we have highres/dyntick on
355 */
356 sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
357
358	/* if none of the timers is firing, this cpu isn't doing much */
359 if (!touched && last_irq_sums[cpu] == sum) {
360 /*
361 * Ayiee, looks like this CPU is stuck ...
362 * wait a few IRQs (5 seconds) before doing the oops ...
363 */
364 alert_counter[cpu]++;
365 if (alert_counter[cpu] == 5*nmi_hz)
366 /*
367 * die_nmi will return ONLY if NOTIFY_STOP happens..
368 */
369 die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
370 } else {
371 last_irq_sums[cpu] = sum;
372 alert_counter[cpu] = 0;
373 }
374 /* see if the nmi watchdog went off */
375 if (!__get_cpu_var(wd_enabled))
376 return rc;
377 switch (nmi_watchdog) {
378 case NMI_LOCAL_APIC:
379 rc |= lapic_wd_event(nmi_hz);
380 break;
381 case NMI_IO_APIC:
382 /* don't know how to accurately check for this.
383 * just assume it was a watchdog timer interrupt
384 * This matches the old behaviour.
385 */
386 rc = 1;
387 break;
388 }
389 return rc;
390}
391
392int do_nmi_callback(struct pt_regs * regs, int cpu)
393{
394#ifdef CONFIG_SYSCTL
395 if (unknown_nmi_panic)
396 return unknown_nmi_panic_callback(regs, cpu);
397#endif
398 return 0;
399}
400
401#ifdef CONFIG_SYSCTL
402
403static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
404{
405 unsigned char reason = get_nmi_reason();
406 char buf[64];
407
408 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
409 die_nmi(regs, buf);
410 return 0;
411}
412
413/*
414 * proc handler for /proc/sys/kernel/nmi
415 */
416int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
417 void __user *buffer, size_t *length, loff_t *ppos)
418{
419 int old_state;
420
421 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
422 old_state = nmi_watchdog_enabled;
423 proc_dointvec(table, write, file, buffer, length, ppos);
424 if (!!old_state == !!nmi_watchdog_enabled)
425 return 0;
426
427 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
428 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
429 return -EIO;
430 }
431
432 if (nmi_watchdog == NMI_DEFAULT) {
433 if (lapic_watchdog_ok())
434 nmi_watchdog = NMI_LOCAL_APIC;
435 else
436 nmi_watchdog = NMI_IO_APIC;
437 }
438
439 if (nmi_watchdog == NMI_LOCAL_APIC) {
440 if (nmi_watchdog_enabled)
441 enable_lapic_nmi_watchdog();
442 else
443 disable_lapic_nmi_watchdog();
444 } else {
445 printk( KERN_WARNING
446 "NMI watchdog doesn't know what hardware to touch\n");
447 return -EIO;
448 }
449 return 0;
450}
451
452#endif
453
454void __trigger_all_cpu_backtrace(void)
455{
456 int i;
457
458 backtrace_mask = cpu_online_map;
459 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
460 for (i = 0; i < 10 * 1000; i++) {
461 if (cpus_empty(backtrace_mask))
462 break;
463 mdelay(1);
464 }
465}
466
467EXPORT_SYMBOL(nmi_active);
468EXPORT_SYMBOL(nmi_watchdog);
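The lockup test in nmi_watchdog_tick() above reduces to: sample the CPU's timer-interrupt count on every watchdog NMI and, if it has not advanced for 5*nmi_hz consecutive samples, call die_nmi(). The following standalone sketch simulates only that counting logic; the interrupt sum is fabricated here, whereas in the kernel it is apic_timer_irqs plus irqs[0].

#include <stdio.h>

#define NMI_HZ 10				/* simulated watchdog rate */

static unsigned int last_irq_sum;
static unsigned int alert_counter;

/* One simulated watchdog tick: returns 1 if a lockup would be declared. */
static int watchdog_tick(unsigned int irq_sum)
{
	if (irq_sum == last_irq_sum) {
		/* timer interrupts stopped advancing on this CPU */
		if (++alert_counter == 5 * NMI_HZ)
			return 1;		/* ~5 seconds stuck -> die_nmi() */
	} else {
		last_irq_sum = irq_sum;
		alert_counter = 0;
	}
	return 0;
}

int main(void)
{
	unsigned int sum = 0;
	int tick;

	for (tick = 0; tick < 100; tick++) {
		if (tick < 20)
			sum++;			/* timer still firing: counter resets */
		if (watchdog_tick(sum)) {
			printf("lockup detected at tick %d\n", tick);
			break;
		}
	}
	return 0;
}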
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
new file mode 100644
index 000000000000..0ec6d2ddb931
--- /dev/null
+++ b/arch/x86/kernel/nmi_64.c
@@ -0,0 +1,483 @@
1/*
2 * linux/arch/x86_64/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/nmi.h>
16#include <linux/mm.h>
17#include <linux/delay.h>
18#include <linux/interrupt.h>
19#include <linux/module.h>
20#include <linux/sysdev.h>
21#include <linux/sysctl.h>
22#include <linux/kprobes.h>
23#include <linux/cpumask.h>
24#include <linux/kdebug.h>
25
26#include <asm/smp.h>
27#include <asm/nmi.h>
28#include <asm/proto.h>
29#include <asm/mce.h>
30
31int unknown_nmi_panic;
32int nmi_watchdog_enabled;
33int panic_on_unrecovered_nmi;
34
35static cpumask_t backtrace_mask = CPU_MASK_NONE;
36
37/* nmi_active:
38 * >0: the lapic NMI watchdog is active, but can be disabled
39 * <0: the lapic NMI watchdog has not been set up, and cannot
40 * be enabled
41 * 0: the lapic NMI watchdog is disabled, but can be enabled
42 */
43atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
44int panic_on_timeout;
45
46unsigned int nmi_watchdog = NMI_DEFAULT;
47static unsigned int nmi_hz = HZ;
48
49static DEFINE_PER_CPU(short, wd_enabled);
50
51/* local prototypes */
52static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
53
54/* Run after command line and cpu_init init, but before all other checks */
55void nmi_watchdog_default(void)
56{
57 if (nmi_watchdog != NMI_DEFAULT)
58 return;
59 nmi_watchdog = NMI_NONE;
60}
61
62static int endflag __initdata = 0;
63
64#ifdef CONFIG_SMP
65/* The performance counters used by NMI_LOCAL_APIC don't trigger when
66 * the CPU is idle. To make sure the NMI watchdog really ticks on all
67 * CPUs during the test make them busy.
68 */
69static __init void nmi_cpu_busy(void *data)
70{
71 local_irq_enable_in_hardirq();
72 /* Intentionally don't use cpu_relax here. This is
73 to make sure that the performance counter really ticks,
74 even if there is a simulator or similar that catches the
75 pause instruction. On a real HT machine this is fine because
76 all other CPUs are busy with "useless" delay loops and don't
77	   care if they get somewhat fewer cycles. */
78 while (endflag == 0)
79 mb();
80}
81#endif
82
83int __init check_nmi_watchdog (void)
84{
85 int *counts;
86 int cpu;
87
88 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
89 return 0;
90
91 if (!atomic_read(&nmi_active))
92 return 0;
93
94 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
95 if (!counts)
96 return -1;
97
98 printk(KERN_INFO "testing NMI watchdog ... ");
99
100#ifdef CONFIG_SMP
101 if (nmi_watchdog == NMI_LOCAL_APIC)
102 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
103#endif
104
105 for (cpu = 0; cpu < NR_CPUS; cpu++)
106 counts[cpu] = cpu_pda(cpu)->__nmi_count;
107 local_irq_enable();
108 mdelay((20*1000)/nmi_hz); // wait 20 ticks
109
110 for_each_online_cpu(cpu) {
111 if (!per_cpu(wd_enabled, cpu))
112 continue;
113 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
114 printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu,
116 counts[cpu],
117 cpu_pda(cpu)->__nmi_count);
118 per_cpu(wd_enabled, cpu) = 0;
119 atomic_dec(&nmi_active);
120 }
121 }
122 if (!atomic_read(&nmi_active)) {
123 kfree(counts);
124 atomic_set(&nmi_active, -1);
125 endflag = 1;
126 return -1;
127 }
128 endflag = 1;
129 printk("OK.\n");
130
131 /* now that we know it works we can reduce NMI frequency to
132 something more reasonable; makes a difference in some configs */
133 if (nmi_watchdog == NMI_LOCAL_APIC)
134 nmi_hz = lapic_adjust_nmi_hz(1);
135
136 kfree(counts);
137 return 0;
138}
139
140int __init setup_nmi_watchdog(char *str)
141{
142 int nmi;
143
144 if (!strncmp(str,"panic",5)) {
145 panic_on_timeout = 1;
146 str = strchr(str, ',');
147 if (!str)
148 return 1;
149 ++str;
150 }
151
152 get_option(&str, &nmi);
153
154 if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
155 return 0;
156
157 nmi_watchdog = nmi;
158 return 1;
159}
160
161__setup("nmi_watchdog=", setup_nmi_watchdog);
162
163
164static void __acpi_nmi_disable(void *__unused)
165{
166 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
167}
168
169/*
170 * Disable timer based NMIs on all CPUs:
171 */
172void acpi_nmi_disable(void)
173{
174 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
175 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
176}
177
178static void __acpi_nmi_enable(void *__unused)
179{
180 apic_write(APIC_LVT0, APIC_DM_NMI);
181}
182
183/*
184 * Enable timer based NMIs on all CPUs:
185 */
186void acpi_nmi_enable(void)
187{
188 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
189 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
190}
191#ifdef CONFIG_PM
192
193static int nmi_pm_active; /* nmi_active before suspend */
194
195static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
196{
197 /* only CPU0 goes here, other CPUs should be offline */
198 nmi_pm_active = atomic_read(&nmi_active);
199 stop_apic_nmi_watchdog(NULL);
200 BUG_ON(atomic_read(&nmi_active) != 0);
201 return 0;
202}
203
204static int lapic_nmi_resume(struct sys_device *dev)
205{
206 /* only CPU0 goes here, other CPUs should be offline */
207 if (nmi_pm_active > 0) {
208 setup_apic_nmi_watchdog(NULL);
209 touch_nmi_watchdog();
210 }
211 return 0;
212}
213
214static struct sysdev_class nmi_sysclass = {
215 set_kset_name("lapic_nmi"),
216 .resume = lapic_nmi_resume,
217 .suspend = lapic_nmi_suspend,
218};
219
220static struct sys_device device_lapic_nmi = {
221 .id = 0,
222 .cls = &nmi_sysclass,
223};
224
225static int __init init_lapic_nmi_sysfs(void)
226{
227 int error;
228
229 /* should really be a BUG_ON but b/c this is an
230 * init call, it just doesn't work. -dcz
231 */
232 if (nmi_watchdog != NMI_LOCAL_APIC)
233 return 0;
234
235 if ( atomic_read(&nmi_active) < 0 )
236 return 0;
237
238 error = sysdev_class_register(&nmi_sysclass);
239 if (!error)
240 error = sysdev_register(&device_lapic_nmi);
241 return error;
242}
243/* must come after the local APIC's device_initcall() */
244late_initcall(init_lapic_nmi_sysfs);
245
246#endif /* CONFIG_PM */
247
248void setup_apic_nmi_watchdog(void *unused)
249{
250 if (__get_cpu_var(wd_enabled) == 1)
251 return;
252
253 /* cheap hack to support suspend/resume */
254 /* if cpu0 is not active neither should the other cpus */
255 if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
256 return;
257
258 switch (nmi_watchdog) {
259 case NMI_LOCAL_APIC:
260 __get_cpu_var(wd_enabled) = 1;
261 if (lapic_watchdog_init(nmi_hz) < 0) {
262 __get_cpu_var(wd_enabled) = 0;
263 return;
264 }
265 /* FALL THROUGH */
266 case NMI_IO_APIC:
267 __get_cpu_var(wd_enabled) = 1;
268 atomic_inc(&nmi_active);
269 }
270}
271
272void stop_apic_nmi_watchdog(void *unused)
273{
274 /* only support LOCAL and IO APICs for now */
275 if ((nmi_watchdog != NMI_LOCAL_APIC) &&
276 (nmi_watchdog != NMI_IO_APIC))
277 return;
278 if (__get_cpu_var(wd_enabled) == 0)
279 return;
280 if (nmi_watchdog == NMI_LOCAL_APIC)
281 lapic_watchdog_stop();
282 __get_cpu_var(wd_enabled) = 0;
283 atomic_dec(&nmi_active);
284}
285
286/*
287 * the best way to detect whether a CPU has a 'hard lockup' problem
288 * is to check its local APIC timer IRQ counts. If they are not
289 * changing then that CPU has some problem.
290 *
291 * as these watchdog NMI IRQs are generated on every CPU, we only
292 * have to check the current processor.
293 */
294
295static DEFINE_PER_CPU(unsigned, last_irq_sum);
296static DEFINE_PER_CPU(local_t, alert_counter);
297static DEFINE_PER_CPU(int, nmi_touch);
298
299void touch_nmi_watchdog(void)
300{
301 if (nmi_watchdog > 0) {
302 unsigned cpu;
303
304 /*
305 * Tell other CPUs to reset their alert counters. We cannot
306 * do it ourselves because the alert count increase is not
307 * atomic.
308 */
309 for_each_present_cpu(cpu) {
310 if (per_cpu(nmi_touch, cpu) != 1)
311 per_cpu(nmi_touch, cpu) = 1;
312 }
313 }
314
315 touch_softlockup_watchdog();
316}
317
318int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
319{
320 int sum;
321 int touched = 0;
322 int cpu = smp_processor_id();
323 int rc = 0;
324
325 /* check for other users first */
326 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
327 == NOTIFY_STOP) {
328 rc = 1;
329 touched = 1;
330 }
331
332 sum = read_pda(apic_timer_irqs);
333 if (__get_cpu_var(nmi_touch)) {
334 __get_cpu_var(nmi_touch) = 0;
335 touched = 1;
336 }
337
338 if (cpu_isset(cpu, backtrace_mask)) {
339 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
340
341 spin_lock(&lock);
342 printk("NMI backtrace for cpu %d\n", cpu);
343 dump_stack();
344 spin_unlock(&lock);
345 cpu_clear(cpu, backtrace_mask);
346 }
347
348#ifdef CONFIG_X86_MCE
349 /* Could check oops_in_progress here too, but it's safer
350	   not to */
351 if (atomic_read(&mce_entry) > 0)
352 touched = 1;
353#endif
354 /* if the apic timer isn't firing, this cpu isn't doing much */
355 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
356 /*
357 * Ayiee, looks like this CPU is stuck ...
358 * wait a few IRQs (5 seconds) before doing the oops ...
359 */
360 local_inc(&__get_cpu_var(alert_counter));
361 if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
362 die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
363 panic_on_timeout);
364 } else {
365 __get_cpu_var(last_irq_sum) = sum;
366 local_set(&__get_cpu_var(alert_counter), 0);
367 }
368
369 /* see if the nmi watchdog went off */
370 if (!__get_cpu_var(wd_enabled))
371 return rc;
372 switch (nmi_watchdog) {
373 case NMI_LOCAL_APIC:
374 rc |= lapic_wd_event(nmi_hz);
375 break;
376 case NMI_IO_APIC:
377 /* don't know how to accurately check for this.
378 * just assume it was a watchdog timer interrupt
379 * This matches the old behaviour.
380 */
381 rc = 1;
382 break;
383 }
384 return rc;
385}
386
387static unsigned ignore_nmis;
388
389asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
390{
391 nmi_enter();
392 add_pda(__nmi_count,1);
393 if (!ignore_nmis)
394 default_do_nmi(regs);
395 nmi_exit();
396}
397
398int do_nmi_callback(struct pt_regs * regs, int cpu)
399{
400#ifdef CONFIG_SYSCTL
401 if (unknown_nmi_panic)
402 return unknown_nmi_panic_callback(regs, cpu);
403#endif
404 return 0;
405}
406
407void stop_nmi(void)
408{
409 acpi_nmi_disable();
410 ignore_nmis++;
411}
412
413void restart_nmi(void)
414{
415 ignore_nmis--;
416 acpi_nmi_enable();
417}
418
419#ifdef CONFIG_SYSCTL
420
421static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
422{
423 unsigned char reason = get_nmi_reason();
424 char buf[64];
425
426 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
427 die_nmi(buf, regs, 1); /* Always panic here */
428 return 0;
429}
430
431/*
432 * proc handler for /proc/sys/kernel/nmi
433 */
434int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
435 void __user *buffer, size_t *length, loff_t *ppos)
436{
437 int old_state;
438
439 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
440 old_state = nmi_watchdog_enabled;
441 proc_dointvec(table, write, file, buffer, length, ppos);
442 if (!!old_state == !!nmi_watchdog_enabled)
443 return 0;
444
445 if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
446 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
447 return -EIO;
448 }
449
450 /* if nmi_watchdog is not set yet, then set it */
451 nmi_watchdog_default();
452
453 if (nmi_watchdog == NMI_LOCAL_APIC) {
454 if (nmi_watchdog_enabled)
455 enable_lapic_nmi_watchdog();
456 else
457 disable_lapic_nmi_watchdog();
458 } else {
459 printk( KERN_WARNING
460 "NMI watchdog doesn't know what hardware to touch\n");
461 return -EIO;
462 }
463 return 0;
464}
465
466#endif
467
468void __trigger_all_cpu_backtrace(void)
469{
470 int i;
471
472 backtrace_mask = cpu_online_map;
473 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
474 for (i = 0; i < 10 * 1000; i++) {
475 if (cpus_empty(backtrace_mask))
476 break;
477 mdelay(1);
478 }
479}
480
481EXPORT_SYMBOL(nmi_active);
482EXPORT_SYMBOL(nmi_watchdog);
483EXPORT_SYMBOL(touch_nmi_watchdog);
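proc_nmi_enabled() above backs a sysctl, so the watchdog can be toggled at run time by writing 0 or 1 to the file named in its comment. A small sketch of doing that from C; the /proc/sys/kernel/nmi path is taken from that comment (an assumption about the final sysctl name), and the write only succeeds when a usable local-APIC watchdog is available.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* path taken from the "proc handler for /proc/sys/kernel/nmi" comment */
	int fd = open("/proc/sys/kernel/nmi", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/kernel/nmi");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)	/* "1" enables, "0" disables */
		perror("write");
	close(fd);
	return 0;
}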
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
new file mode 100644
index 000000000000..9000d82c6dc0
--- /dev/null
+++ b/arch/x86/kernel/numaq_32.c
@@ -0,0 +1,89 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/module.h>
30#include <linux/nodemask.h>
31#include <asm/numaq.h>
32#include <asm/topology.h>
33#include <asm/processor.h>
34
35#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
36
37/*
38 * Function: smp_dump_qct()
39 *
40 * Description: gets memory layout from the quad config table. This
41 * function also updates node_online_map with the nodes (quads) present.
42 */
43static void __init smp_dump_qct(void)
44{
45 int node;
46 struct eachquadmem *eq;
47 struct sys_cfg_data *scd =
48 (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
49
50 nodes_clear(node_online_map);
51 for_each_node(node) {
52 if (scd->quads_present31_0 & (1 << node)) {
53 node_set_online(node);
54 eq = &scd->eq[node];
55 /* Convert to pages */
56 node_start_pfn[node] = MB_TO_PAGES(
57 eq->hi_shrd_mem_start - eq->priv_mem_size);
58 node_end_pfn[node] = MB_TO_PAGES(
59 eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
60
61 memory_present(node,
62 node_start_pfn[node], node_end_pfn[node]);
63 node_remap_size[node] = node_memmap_size_bytes(node,
64 node_start_pfn[node],
65 node_end_pfn[node]);
66 }
67 }
68}
69
70/*
71 * Unlike Summit, we don't really care to let the NUMA-Q
72 * fall back to flat mode. Don't compile for NUMA-Q
73 * unless you really need it!
74 */
75int __init get_memcfg_numaq(void)
76{
77 smp_dump_qct();
78 return 1;
79}
80
81static int __init numaq_tsc_disable(void)
82{
83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1;
86 }
87 return 0;
88}
89arch_initcall(numaq_tsc_disable);
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
new file mode 100644
index 000000000000..739cfb207dd7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_32.c
@@ -0,0 +1,392 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/highmem.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35#include <asm/timer.h>
36
37/* nop stub */
38void _paravirt_nop(void)
39{
40}
41
42static void __init default_banner(void)
43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name);
46}
47
48char *memory_setup(void)
49{
50 return paravirt_ops.memory_setup();
51}
52
53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \
55 extern const char start_##name[], end_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":")
57
58DEF_NATIVE(irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc");
69
70DEF_NATIVE(ud2a, "ud2a");
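/*
 * Each DEF_NATIVE() above emits the raw native instruction bytes for
 * one operation between a start_<name>/end_<name> label pair; for
 * example DEF_NATIVE(irq_disable, "cli") produces the single byte
 * 0xfa.  native_patch() below copies those bytes over the matching
 * paravirt call site when running on bare hardware.
 */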
71
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len)
74{
75 const unsigned char *start, *end;
76 unsigned ret;
77
78 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
80 SITE(irq_disable);
81 SITE(irq_enable);
82 SITE(restore_fl);
83 SITE(save_fl);
84 SITE(iret);
85 SITE(irq_enable_sysexit);
86 SITE(read_cr2);
87 SITE(read_cr3);
88 SITE(write_cr3);
89 SITE(clts);
90 SITE(read_tsc);
91#undef SITE
92
93 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break;
96
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break;
113 }
114
115 return ret;
116}
117
118unsigned paravirt_patch_nop(void)
119{
120 return 0;
121}
122
123unsigned paravirt_patch_ignore(unsigned len)
124{
125 return len;
126}
127
128struct branch {
129 unsigned char opcode;
130 u32 delta;
131} __attribute__((packed));
132
133unsigned paravirt_patch_call(void *insnbuf,
134 const void *target, u16 tgt_clobbers,
135 unsigned long addr, u16 site_clobbers,
136 unsigned len)
137{
138 struct branch *b = insnbuf;
139 unsigned long delta = (unsigned long)target - (addr+5);
140
141 if (tgt_clobbers & ~site_clobbers)
142 return len; /* target would clobber too much for this site */
143 if (len < 5)
144 return len; /* call too long for patch site */
145
146 b->opcode = 0xe8; /* call */
147 b->delta = delta;
148 BUILD_BUG_ON(sizeof(*b) != 5);
149
150 return 5;
151}
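/*
 * The five bytes written above form a standard ia32 rel32 call:
 * opcode 0xe8 followed by (target - address of the next instruction)
 * as a little-endian 32-bit displacement.  For example, a call site
 * at 0x1000 redirected to a target at 0x2000 is patched to
 * e8 fb 0f 00 00.
 */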
152
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
154 unsigned long addr, unsigned len)
155{
156 struct branch *b = insnbuf;
157 unsigned long delta = (unsigned long)target - (addr+5);
158
159 if (len < 5)
160 return len; /* call too long for patch site */
161
162 b->opcode = 0xe9; /* jmp */
163 b->delta = delta;
164
165 return 5;
166}
167
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len)
170{
171 void *opfunc = *((void **)&paravirt_ops + type);
172 unsigned ret;
173
174 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
177 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
184 else
185 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */
187 ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
188 addr, clobbers, len);
189
190 return ret;
191}
192
193unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
194 const char *start, const char *end)
195{
196 unsigned insn_len = end - start;
197
198 if (insn_len > len || start == NULL)
199 insn_len = len;
200 else
201 memcpy(insnbuf, start, insn_len);
202
203 return insn_len;
204}
205
206void init_IRQ(void)
207{
208 paravirt_ops.init_IRQ();
209}
210
211static void native_flush_tlb(void)
212{
213 __native_flush_tlb();
214}
215
216/*
217 * Global pages have to be flushed a bit differently. Not a real
218 * performance problem because this does not happen often.
219 */
220static void native_flush_tlb_global(void)
221{
222 __native_flush_tlb_global();
223}
224
225static void native_flush_tlb_single(unsigned long addr)
226{
227 __native_flush_tlb_single(addr);
228}
229
230/* These are in entry.S */
231extern void native_iret(void);
232extern void native_irq_enable_sysexit(void);
233
234static int __init print_banner(void)
235{
236 paravirt_ops.banner();
237 return 0;
238}
239core_initcall(print_banner);
240
241static struct resource reserve_ioports = {
242 .start = 0,
243 .end = IO_SPACE_LIMIT,
244 .name = "paravirt-ioport",
245 .flags = IORESOURCE_IO | IORESOURCE_BUSY,
246};
247
248static struct resource reserve_iomem = {
249 .start = 0,
250 .end = -1,
251 .name = "paravirt-iomem",
252 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
253};
254
255/*
256 * Reserve the whole legacy IO space to prevent any legacy drivers
257 * from wasting time probing for their hardware. This is a fairly
258 * brute-force approach to disabling all non-virtual drivers.
259 *
260 * Note that this must be called very early to have any effect.
261 */
262int paravirt_disable_iospace(void)
263{
264 int ret;
265
266 ret = request_resource(&ioport_resource, &reserve_ioports);
267 if (ret == 0) {
268 ret = request_resource(&iomem_resource, &reserve_iomem);
269 if (ret)
270 release_resource(&reserve_ioports);
271 }
272
273 return ret;
274}
275
276struct paravirt_ops paravirt_ops = {
277 .name = "bare hardware",
278 .paravirt_enabled = 0,
279 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
281
282 .patch = native_patch,
283 .banner = default_banner,
284 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup,
286 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init,
289 .init_IRQ = native_init_IRQ,
290
291 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg,
294 .clts = native_clts,
295 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt,
320 .load_idt = native_load_idt,
321 .store_gdt = native_store_gdt,
322 .store_idt = native_store_idt,
323 .store_tr = native_store_tr,
324 .load_tls = native_load_tls,
325 .write_ldt_entry = write_dt_entry,
326 .write_gdt_entry = write_dt_entry,
327 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0,
329
330 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay,
332
333#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic,
336 .apic_read = native_apic_read,
337 .setup_boot_clock = setup_boot_APIC_clock,
338 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop,
340#endif
341 .set_lazy_mode = paravirt_nop,
342
343 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done,
345
346 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single,
349 .flush_tlb_others = native_flush_tlb_others,
350
351 .alloc_pt = paravirt_nop,
352 .alloc_pd = paravirt_nop,
353 .alloc_pd_clone = paravirt_nop,
354 .release_pt = paravirt_nop,
355 .release_pd = paravirt_nop,
356
357 .set_pte = native_set_pte,
358 .set_pte_at = native_set_pte_at,
359 .set_pmd = native_set_pmd,
360 .pte_update = paravirt_nop,
361 .pte_update_defer = paravirt_nop,
362
363#ifdef CONFIG_HIGHPTE
364 .kmap_atomic_pte = kmap_atomic,
365#endif
366
367#ifdef CONFIG_X86_PAE
368 .set_pte_atomic = native_set_pte_atomic,
369 .set_pte_present = native_set_pte_present,
370 .set_pud = native_set_pud,
371 .pte_clear = native_pte_clear,
372 .pmd_clear = native_pmd_clear,
373
374 .pmd_val = native_pmd_val,
375 .make_pmd = native_make_pmd,
376#endif
377
378 .pte_val = native_pte_val,
379 .pgd_val = native_pgd_val,
380
381 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd,
383
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop,
390};
391
392EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644
index 000000000000..71da01e73f03
--- /dev/null
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -0,0 +1,1578 @@
1/*
2 * Derived from arch/powerpc/kernel/iommu.c
3 *
4 * Copyright IBM Corporation, 2006-2007
5 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
6 *
7 * Author: Jon Mason <jdmason@kudzu.us>
8 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/mm.h>
30#include <linux/spinlock.h>
31#include <linux/string.h>
32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h>
35#include <linux/pci_ids.h>
36#include <linux/pci.h>
37#include <linux/delay.h>
38#include <asm/iommu.h>
39#include <asm/calgary.h>
40#include <asm/tce.h>
41#include <asm/pci-direct.h>
42#include <asm/system.h>
43#include <asm/dma.h>
44#include <asm/rio.h>
45
46#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
47int use_calgary __read_mostly = 1;
48#else
49int use_calgary __read_mostly = 0;
50#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
51
52#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
53#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
54
55/* register offsets inside the host bridge space */
56#define CALGARY_CONFIG_REG 0x0108
57#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
58#define PHB_PLSSR_OFFSET 0x0120
59#define PHB_CONFIG_RW_OFFSET 0x0160
60#define PHB_IOBASE_BAR_LOW 0x0170
61#define PHB_IOBASE_BAR_HIGH 0x0180
62#define PHB_MEM_1_LOW 0x0190
63#define PHB_MEM_1_HIGH 0x01A0
64#define PHB_IO_ADDR_SIZE 0x01B0
65#define PHB_MEM_1_SIZE 0x01C0
66#define PHB_MEM_ST_OFFSET 0x01D0
67#define PHB_AER_OFFSET 0x0200
68#define PHB_CONFIG_0_HIGH 0x0220
69#define PHB_CONFIG_0_LOW 0x0230
70#define PHB_CONFIG_0_END 0x0240
71#define PHB_MEM_2_LOW 0x02B0
72#define PHB_MEM_2_HIGH 0x02C0
73#define PHB_MEM_2_SIZE_HIGH 0x02D0
74#define PHB_MEM_2_SIZE_LOW 0x02E0
75#define PHB_DOSHOLE_OFFSET 0x08E0
76
77/* CalIOC2 specific */
78#define PHB_SAVIOR_L2 0x0DB0
79#define PHB_PAGE_MIG_CTRL 0x0DA8
80#define PHB_PAGE_MIG_DEBUG 0x0DA0
81#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
82
83/* PHB_CONFIG_RW */
84#define PHB_TCE_ENABLE 0x20000000
85#define PHB_SLOT_DISABLE 0x1C000000
86#define PHB_DAC_DISABLE 0x01000000
87#define PHB_MEM2_ENABLE 0x00400000
88#define PHB_MCSR_ENABLE 0x00100000
89/* TAR (Table Address Register) */
90#define TAR_SW_BITS 0x0000ffffffff800fUL
91#define TAR_VALID 0x0000000000000008UL
92/* CSR (Channel/DMA Status Register) */
93#define CSR_AGENT_MASK 0xffe0ffff
94/* CCR (Calgary Configuration Register) */
95#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
96/* PMCR/PMDR (Page Migration Control/Debug Registers) */
97#define PMR_SOFTSTOP 0x80000000
98#define PMR_SOFTSTOPFAULT 0x40000000
99#define PMR_HARDSTOP 0x20000000
100
101#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
102#define MAX_NUM_CHASSIS 8 /* max number of chassis */
103/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
104#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
105#define PHBS_PER_CALGARY 4
106
107/* register offsets in Calgary's internal register space */
108static const unsigned long tar_offsets[] = {
109 0x0580 /* TAR0 */,
110 0x0588 /* TAR1 */,
111 0x0590 /* TAR2 */,
112 0x0598 /* TAR3 */
113};
114
115static const unsigned long split_queue_offsets[] = {
116 0x4870 /* SPLIT QUEUE 0 */,
117 0x5870 /* SPLIT QUEUE 1 */,
118 0x6870 /* SPLIT QUEUE 2 */,
119 0x7870 /* SPLIT QUEUE 3 */
120};
121
122static const unsigned long phb_offsets[] = {
123 0x8000 /* PHB0 */,
124 0x9000 /* PHB1 */,
125 0xA000 /* PHB2 */,
126 0xB000 /* PHB3 */
127};
128
129/* PHB debug registers */
130
131static const unsigned long phb_debug_offsets[] = {
132 0x4000 /* PHB 0 DEBUG */,
133 0x5000 /* PHB 1 DEBUG */,
134 0x6000 /* PHB 2 DEBUG */,
135 0x7000 /* PHB 3 DEBUG */
136};
137
138/*
139 * STUFF register for each debug PHB,
140 * byte 1 = start bus number, byte 2 = end bus number
141 */
142
143#define PHB_DEBUG_STUFF_OFFSET 0x0020
144
145#define EMERGENCY_PAGES 32 /* = 128KB */
146
147unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
148static int translate_empty_slots __read_mostly = 0;
149static int calgary_detected __read_mostly = 0;
150
151static struct rio_table_hdr *rio_table_hdr __initdata;
152static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
153static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
154
155struct calgary_bus_info {
156 void *tce_space;
157 unsigned char translation_disabled;
158 signed char phbid;
159 void __iomem *bbar;
160};
161
162static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
163static void calgary_tce_cache_blast(struct iommu_table *tbl);
164static void calgary_dump_error_regs(struct iommu_table *tbl);
165static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
166static void calioc2_tce_cache_blast(struct iommu_table *tbl);
167static void calioc2_dump_error_regs(struct iommu_table *tbl);
168
169static struct cal_chipset_ops calgary_chip_ops = {
170 .handle_quirks = calgary_handle_quirks,
171 .tce_cache_blast = calgary_tce_cache_blast,
172 .dump_error_regs = calgary_dump_error_regs
173};
174
175static struct cal_chipset_ops calioc2_chip_ops = {
176 .handle_quirks = calioc2_handle_quirks,
177 .tce_cache_blast = calioc2_tce_cache_blast,
178 .dump_error_regs = calioc2_dump_error_regs
179};
180
181static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
182
183/* enable this to stress test the chip's TCE cache */
184#ifdef CONFIG_IOMMU_DEBUG
185int debugging __read_mostly = 1;
186
187static inline unsigned long verify_bit_range(unsigned long* bitmap,
188 int expected, unsigned long start, unsigned long end)
189{
190 unsigned long idx = start;
191
192 BUG_ON(start >= end);
193
194 while (idx < end) {
195 if (!!test_bit(idx, bitmap) != expected)
196 return idx;
197 ++idx;
198 }
199
200 /* all bits have the expected value */
201 return ~0UL;
202}
203#else /* debugging is disabled */
204int debugging __read_mostly = 0;
205
206static inline unsigned long verify_bit_range(unsigned long* bitmap,
207 int expected, unsigned long start, unsigned long end)
208{
209 return ~0UL;
210}
211
212#endif /* CONFIG_IOMMU_DEBUG */
213
214static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
215{
216 unsigned int npages;
217
218 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
219 npages >>= PAGE_SHIFT;
220
221 return npages;
222}
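/*
 * Example of the page count above with 4 KB pages: a buffer at
 * dma 0x1ff0 with dmalen 0x20 ends at 0x2010, so
 * PAGE_ALIGN(0x2010) == 0x3000 minus (0x1ff0 & PAGE_MASK) == 0x1000
 * gives 0x2000, i.e. npages == 2, since the buffer straddles a page
 * boundary.
 */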
223
224static inline int translate_phb(struct pci_dev* dev)
225{
226 int disabled = bus_info[dev->bus->number].translation_disabled;
227 return !disabled;
228}
229
230static void iommu_range_reserve(struct iommu_table *tbl,
231 unsigned long start_addr, unsigned int npages)
232{
233 unsigned long index;
234 unsigned long end;
235 unsigned long badbit;
236 unsigned long flags;
237
238 index = start_addr >> PAGE_SHIFT;
239
240 /* bail out if we're asked to reserve a region we don't cover */
241 if (index >= tbl->it_size)
242 return;
243
244 end = index + npages;
245 if (end > tbl->it_size) /* don't go off the table */
246 end = tbl->it_size;
247
248 spin_lock_irqsave(&tbl->it_lock, flags);
249
250 badbit = verify_bit_range(tbl->it_map, 0, index, end);
251 if (badbit != ~0UL) {
252 if (printk_ratelimit())
253 printk(KERN_ERR "Calgary: entry already allocated at "
254 "0x%lx tbl %p dma 0x%lx npages %u\n",
255 badbit, tbl, start_addr, npages);
256 }
257
258 set_bit_string(tbl->it_map, index, npages);
259
260 spin_unlock_irqrestore(&tbl->it_lock, flags);
261}
262
263static unsigned long iommu_range_alloc(struct iommu_table *tbl,
264 unsigned int npages)
265{
266 unsigned long flags;
267 unsigned long offset;
268
269 BUG_ON(npages == 0);
270
271 spin_lock_irqsave(&tbl->it_lock, flags);
272
273 offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
274 tbl->it_size, npages);
275 if (offset == ~0UL) {
276 tbl->chip_ops->tce_cache_blast(tbl);
277 offset = find_next_zero_string(tbl->it_map, 0,
278 tbl->it_size, npages);
279 if (offset == ~0UL) {
280 printk(KERN_WARNING "Calgary: IOMMU full.\n");
281 spin_unlock_irqrestore(&tbl->it_lock, flags);
282 if (panic_on_overflow)
283 panic("Calgary: fix the allocator.\n");
284 else
285 return bad_dma_address;
286 }
287 }
288
289 set_bit_string(tbl->it_map, offset, npages);
290 tbl->it_hint = offset + npages;
291 BUG_ON(tbl->it_hint > tbl->it_size);
292
293 spin_unlock_irqrestore(&tbl->it_lock, flags);
294
295 return offset;
296}
297
298static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
299 unsigned int npages, int direction)
300{
301 unsigned long entry;
302 dma_addr_t ret = bad_dma_address;
303
304 entry = iommu_range_alloc(tbl, npages);
305
306 if (unlikely(entry == bad_dma_address))
307 goto error;
308
309 /* set the return dma address */
310 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
311
312 /* put the TCEs in the HW table */
313 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
314 direction);
315
316 return ret;
317
318error:
319 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
320 "iommu %p\n", npages, tbl);
321 return bad_dma_address;
322}
323
324static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
325 unsigned int npages)
326{
327 unsigned long entry;
328 unsigned long badbit;
329 unsigned long badend;
330 unsigned long flags;
331
332 /* were we called with bad_dma_address? */
333 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
334 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
335 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
336 "address 0x%Lx\n", dma_addr);
337 WARN_ON(1);
338 return;
339 }
340
341 entry = dma_addr >> PAGE_SHIFT;
342
343 BUG_ON(entry + npages > tbl->it_size);
344
345 tce_free(tbl, entry, npages);
346
347 spin_lock_irqsave(&tbl->it_lock, flags);
348
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 __clear_bit_string(tbl->it_map, entry, npages);
358
359 spin_unlock_irqrestore(&tbl->it_lock, flags);
360}
361
362static inline struct iommu_table *find_iommu_table(struct device *dev)
363{
364 struct pci_dev *pdev;
365 struct pci_bus *pbus;
366 struct iommu_table *tbl;
367
368 pdev = to_pci_dev(dev);
369
370 pbus = pdev->bus;
371
372 /* is the device behind a bridge? Look for the root bus */
373 while (pbus->parent)
374 pbus = pbus->parent;
375
376 tbl = pci_iommu(pbus);
377
378 BUG_ON(tbl && (tbl->it_busno != pbus->number));
379
380 return tbl;
381}
382
383static void calgary_unmap_sg(struct device *dev,
384 struct scatterlist *sglist, int nelems, int direction)
385{
386 struct iommu_table *tbl = find_iommu_table(dev);
387
388 if (!translate_phb(to_pci_dev(dev)))
389 return;
390
391 while (nelems--) {
392 unsigned int npages;
393 dma_addr_t dma = sglist->dma_address;
394 unsigned int dmalen = sglist->dma_length;
395
396 if (dmalen == 0)
397 break;
398
399 npages = num_dma_pages(dma, dmalen);
400 iommu_free(tbl, dma, npages);
401 sglist++;
402 }
403}
404
405static int calgary_nontranslate_map_sg(struct device* dev,
406 struct scatterlist *sg, int nelems, int direction)
407{
408 int i;
409
410 for (i = 0; i < nelems; i++ ) {
411 struct scatterlist *s = &sg[i];
412 BUG_ON(!s->page);
413 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
414 s->dma_length = s->length;
415 }
416 return nelems;
417}
418
419static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
420 int nelems, int direction)
421{
422 struct iommu_table *tbl = find_iommu_table(dev);
423 unsigned long vaddr;
424 unsigned int npages;
425 unsigned long entry;
426 int i;
427
428 if (!translate_phb(to_pci_dev(dev)))
429 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
430
431 for (i = 0; i < nelems; i++ ) {
432 struct scatterlist *s = &sg[i];
433 BUG_ON(!s->page);
434
435 vaddr = (unsigned long)page_address(s->page) + s->offset;
436 npages = num_dma_pages(vaddr, s->length);
437
438 entry = iommu_range_alloc(tbl, npages);
439 if (entry == bad_dma_address) {
440 /* makes sure unmap knows to stop */
441 s->dma_length = 0;
442 goto error;
443 }
444
445 s->dma_address = (entry << PAGE_SHIFT) | s->offset;
446
447 /* insert into HW table */
448 tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
449 direction);
450
451 s->dma_length = s->length;
452 }
453
454 return nelems;
455error:
456 calgary_unmap_sg(dev, sg, nelems, direction);
457 for (i = 0; i < nelems; i++) {
458 sg[i].dma_address = bad_dma_address;
459 sg[i].dma_length = 0;
460 }
461 return 0;
462}
463
464static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
465 size_t size, int direction)
466{
467 dma_addr_t dma_handle = bad_dma_address;
468 unsigned long uaddr;
469 unsigned int npages;
470 struct iommu_table *tbl = find_iommu_table(dev);
471
472 uaddr = (unsigned long)vaddr;
473 npages = num_dma_pages(uaddr, size);
474
475 if (translate_phb(to_pci_dev(dev)))
476 dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
477 else
478 dma_handle = virt_to_bus(vaddr);
479
480 return dma_handle;
481}
482
483static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
484 size_t size, int direction)
485{
486 struct iommu_table *tbl = find_iommu_table(dev);
487 unsigned int npages;
488
489 if (!translate_phb(to_pci_dev(dev)))
490 return;
491
492 npages = num_dma_pages(dma_handle, size);
493 iommu_free(tbl, dma_handle, npages);
494}
495
496static void* calgary_alloc_coherent(struct device *dev, size_t size,
497 dma_addr_t *dma_handle, gfp_t flag)
498{
499 void *ret = NULL;
500 dma_addr_t mapping;
501 unsigned int npages, order;
502 struct iommu_table *tbl = find_iommu_table(dev);
503
504 size = PAGE_ALIGN(size); /* size rounded up to full pages */
505 npages = size >> PAGE_SHIFT;
506 order = get_order(size);
507
508 /* alloc enough pages (and possibly more) */
509 ret = (void *)__get_free_pages(flag, order);
510 if (!ret)
511 goto error;
512 memset(ret, 0, size);
513
514 if (translate_phb(to_pci_dev(dev))) {
515 /* set up tces to cover the allocated range */
516 mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
517 if (mapping == bad_dma_address)
518 goto free;
519
520 *dma_handle = mapping;
521 } else /* non translated slot */
522 *dma_handle = virt_to_bus(ret);
523
524 return ret;
525
526free:
527 free_pages((unsigned long)ret, get_order(size));
528 ret = NULL;
529error:
530 return ret;
531}
532
533static const struct dma_mapping_ops calgary_dma_ops = {
534 .alloc_coherent = calgary_alloc_coherent,
535 .map_single = calgary_map_single,
536 .unmap_single = calgary_unmap_single,
537 .map_sg = calgary_map_sg,
538 .unmap_sg = calgary_unmap_sg,
539};
540
541static inline void __iomem * busno_to_bbar(unsigned char num)
542{
543 return bus_info[num].bbar;
544}
545
546static inline int busno_to_phbid(unsigned char num)
547{
548 return bus_info[num].phbid;
549}
550
551static inline unsigned long split_queue_offset(unsigned char num)
552{
553 size_t idx = busno_to_phbid(num);
554
555 return split_queue_offsets[idx];
556}
557
558static inline unsigned long tar_offset(unsigned char num)
559{
560 size_t idx = busno_to_phbid(num);
561
562 return tar_offsets[idx];
563}
564
565static inline unsigned long phb_offset(unsigned char num)
566{
567 size_t idx = busno_to_phbid(num);
568
569 return phb_offsets[idx];
570}
571
572static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
573{
574 unsigned long target = ((unsigned long)bar) | offset;
575 return (void __iomem*)target;
576}
577
578static inline int is_calioc2(unsigned short device)
579{
580 return (device == PCI_DEVICE_ID_IBM_CALIOC2);
581}
582
583static inline int is_calgary(unsigned short device)
584{
585 return (device == PCI_DEVICE_ID_IBM_CALGARY);
586}
587
588static inline int is_cal_pci_dev(unsigned short device)
589{
590 return (is_calgary(device) || is_calioc2(device));
591}
592
593static void calgary_tce_cache_blast(struct iommu_table *tbl)
594{
595 u64 val;
596 u32 aer;
597 int i = 0;
598 void __iomem *bbar = tbl->bbar;
599 void __iomem *target;
600
601 /* disable arbitration on the bus */
602 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
603 aer = readl(target);
604 writel(0, target);
605
606 /* read plssr to ensure it got there */
607 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
608 val = readl(target);
609
610 /* poll split queues until all DMA activity is done */
611 target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
612 do {
613 val = readq(target);
614 i++;
615 } while ((val & 0xff) != 0xff && i < 100);
616 if (i == 100)
617 printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
618 "continuing anyway\n");
619
620 /* invalidate TCE cache */
621 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
622 writeq(tbl->tar_val, target);
623
624 /* enable arbitration */
625 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
626 writel(aer, target);
627 (void)readl(target); /* flush */
628}
629
630static void calioc2_tce_cache_blast(struct iommu_table *tbl)
631{
632 void __iomem *bbar = tbl->bbar;
633 void __iomem *target;
634 u64 val64;
635 u32 val;
636 int i = 0;
637 int count = 1;
638 unsigned char bus = tbl->it_busno;
639
640begin:
641 printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
642 "sequence - count %d\n", bus, count);
643
644 /* 1. using the Page Migration Control reg set SoftStop */
645 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
646 val = be32_to_cpu(readl(target));
647 printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
648 val |= PMR_SOFTSTOP;
649 printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
650 writel(cpu_to_be32(val), target);
651
652 /* 2. poll split queues until all DMA activity is done */
653 printk(KERN_DEBUG "2a. starting to poll split queues\n");
654 target = calgary_reg(bbar, split_queue_offset(bus));
655 do {
656 val64 = readq(target);
657 i++;
658 } while ((val64 & 0xff) != 0xff && i < 100);
659 if (i == 100)
660 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
661 "continuing anyway\n");
662
663 /* 3. poll Page Migration DEBUG for SoftStopFault */
664 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
665 val = be32_to_cpu(readl(target));
666 printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
667
668 /* 4. if SoftStopFault - goto (1) */
669 if (val & PMR_SOFTSTOPFAULT) {
670 if (++count < 100)
671 goto begin;
672 else {
673 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
674 "aborting TCE cache flush sequence!\n");
675 return; /* pray for the best */
676 }
677 }
678
679 /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
680 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
681 printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
682 val = be32_to_cpu(readl(target));
683 printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
684 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
685 val = be32_to_cpu(readl(target));
686 printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
687
688 /* 6. invalidate TCE cache */
689 printk(KERN_DEBUG "6. invalidating TCE cache\n");
690 target = calgary_reg(bbar, tar_offset(bus));
691 writeq(tbl->tar_val, target);
692
693 /* 7. Re-read PMCR */
694 printk(KERN_DEBUG "7a. Re-reading PMCR\n");
695 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
696 val = be32_to_cpu(readl(target));
697 printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
698
699 /* 8. Remove HardStop */
700 printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
701 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
702 val = 0;
703 printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
704 writel(cpu_to_be32(val), target);
705 val = be32_to_cpu(readl(target));
706 printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
707}
708
709static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
710 u64 limit)
711{
712 unsigned int numpages;
713
714 limit = limit | 0xfffff;
715 limit++;
716
717 numpages = ((limit - start) >> PAGE_SHIFT);
718 iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
719}
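/*
 * The OR with 0xfffff and the increment above round the inclusive
 * limit up to the next 1 MB boundary, e.g. a limit of 0x12345678
 * becomes 0x12400000 before the page count is computed.
 */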
720
721static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
722{
723 void __iomem *target;
724 u64 low, high, sizelow;
725 u64 start, limit;
726 struct iommu_table *tbl = pci_iommu(dev->bus);
727 unsigned char busnum = dev->bus->number;
728 void __iomem *bbar = tbl->bbar;
729
730 /* peripheral MEM_1 region */
731 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
732 low = be32_to_cpu(readl(target));
733 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
734 high = be32_to_cpu(readl(target));
735 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
736 sizelow = be32_to_cpu(readl(target));
737
738 start = (high << 32) | low;
739 limit = sizelow;
740
741 calgary_reserve_mem_region(dev, start, limit);
742}
743
744static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
745{
746 void __iomem *target;
747 u32 val32;
748 u64 low, high, sizelow, sizehigh;
749 u64 start, limit;
750 struct iommu_table *tbl = pci_iommu(dev->bus);
751 unsigned char busnum = dev->bus->number;
752 void __iomem *bbar = tbl->bbar;
753
754 /* is it enabled? */
755 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
756 val32 = be32_to_cpu(readl(target));
757 if (!(val32 & PHB_MEM2_ENABLE))
758 return;
759
760 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
761 low = be32_to_cpu(readl(target));
762 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
763 high = be32_to_cpu(readl(target));
764 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
765 sizelow = be32_to_cpu(readl(target));
766 target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
767 sizehigh = be32_to_cpu(readl(target));
768
769 start = (high << 32) | low;
770 limit = (sizehigh << 32) | sizelow;
771
772 calgary_reserve_mem_region(dev, start, limit);
773}
774
775/*
776 * some regions of the IO address space do not get translated, so we
777 * must not give devices IO addresses in those regions. The regions
778 * are the 640KB-1MB region and the two PCI peripheral memory holes.
779 * Reserve all of them in the IOMMU bitmap to avoid giving them out
780 * later.
781 */
782static void __init calgary_reserve_regions(struct pci_dev *dev)
783{
784 unsigned int npages;
785 u64 start;
786 struct iommu_table *tbl = pci_iommu(dev->bus);
787
788 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
789 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
790
791 /* avoid the BIOS/VGA first 640KB-1MB region */
792 /* for CalIOC2 - avoid the entire first MB */
793 if (is_calgary(dev->device)) {
794 start = (640 * 1024);
795 npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
796 } else { /* calioc2 */
797 start = 0;
798 npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
799 }
800 iommu_range_reserve(tbl, start, npages);
801
802 /* reserve the two PCI peripheral memory regions in IO space */
803 calgary_reserve_peripheral_mem_1(dev);
804 calgary_reserve_peripheral_mem_2(dev);
805}
806
807static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
808{
809 u64 val64;
810 u64 table_phys;
811 void __iomem *target;
812 int ret;
813 struct iommu_table *tbl;
814
815 /* build TCE tables for each PHB */
816 ret = build_tce_table(dev, bbar);
817 if (ret)
818 return ret;
819
820 tbl = pci_iommu(dev->bus);
821 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
822 tce_free(tbl, 0, tbl->it_size);
823
824 if (is_calgary(dev->device))
825 tbl->chip_ops = &calgary_chip_ops;
826 else if (is_calioc2(dev->device))
827 tbl->chip_ops = &calioc2_chip_ops;
828 else
829 BUG();
830
831 calgary_reserve_regions(dev);
832
833 /* set TARs for each PHB */
834 target = calgary_reg(bbar, tar_offset(dev->bus->number));
835 val64 = be64_to_cpu(readq(target));
836
837 /* zero out all TAR bits under sw control */
838 val64 &= ~TAR_SW_BITS;
839 table_phys = (u64)__pa(tbl->it_base);
840
841 val64 |= table_phys;
842
843 BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
844 val64 |= (u64) specified_table_size;
845
846 tbl->tar_val = cpu_to_be64(val64);
847
848 writeq(tbl->tar_val, target);
849 readq(target); /* flush */
850
851 return 0;
852}
853
854static void __init calgary_free_bus(struct pci_dev *dev)
855{
856 u64 val64;
857 struct iommu_table *tbl = pci_iommu(dev->bus);
858 void __iomem *target;
859 unsigned int bitmapsz;
860
861 target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
862 val64 = be64_to_cpu(readq(target));
863 val64 &= ~TAR_SW_BITS;
864 writeq(cpu_to_be64(val64), target);
865 readq(target); /* flush */
866
867 bitmapsz = tbl->it_size / BITS_PER_BYTE;
868 free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
869 tbl->it_map = NULL;
870
871 kfree(tbl);
872
873 set_pci_iommu(dev->bus, NULL);
874
875 /* Can't free bootmem allocated memory after system is up :-( */
876 bus_info[dev->bus->number].tce_space = NULL;
877}
878
879static void calgary_dump_error_regs(struct iommu_table *tbl)
880{
881 void __iomem *bbar = tbl->bbar;
882 void __iomem *target;
883 u32 csr, plssr;
884
885 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
886 csr = be32_to_cpu(readl(target));
887
888 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
889 plssr = be32_to_cpu(readl(target));
890
891 /* If no error, the agent ID in the CSR is not valid */
892 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
893 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
894}
895
896static void calioc2_dump_error_regs(struct iommu_table *tbl)
897{
898 void __iomem *bbar = tbl->bbar;
899 u32 csr, csmr, plssr, mck, rcstat;
900 void __iomem *target;
901 unsigned long phboff = phb_offset(tbl->it_busno);
902 unsigned long erroff;
903 u32 errregs[7];
904 int i;
905
906 /* dump CSR */
907 target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
908 csr = be32_to_cpu(readl(target));
909 /* dump PLSSR */
910 target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
911 plssr = be32_to_cpu(readl(target));
912 /* dump CSMR */
913 target = calgary_reg(bbar, phboff | 0x290);
914 csmr = be32_to_cpu(readl(target));
915 /* dump mck */
916 target = calgary_reg(bbar, phboff | 0x800);
917 mck = be32_to_cpu(readl(target));
918
919 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
920 tbl->it_busno);
921
922 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
923 csr, plssr, csmr, mck);
924
925 /* dump rest of error regs */
926 printk(KERN_EMERG "Calgary: ");
927 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
928 /* err regs are at 0x810 - 0x870 */
929 erroff = (0x810 + (i * 0x10));
930 target = calgary_reg(bbar, phboff | erroff);
931 errregs[i] = be32_to_cpu(readl(target));
932 printk("0x%08x@0x%lx ", errregs[i], erroff);
933 }
934 printk("\n");
935
936 /* root complex status */
937 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
938 rcstat = be32_to_cpu(readl(target));
939 printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
940 PHB_ROOT_COMPLEX_STATUS);
941}
942
943static void calgary_watchdog(unsigned long data)
944{
945 struct pci_dev *dev = (struct pci_dev *)data;
946 struct iommu_table *tbl = pci_iommu(dev->bus);
947 void __iomem *bbar = tbl->bbar;
948 u32 val32;
949 void __iomem *target;
950
951 target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
952 val32 = be32_to_cpu(readl(target));
953
954 /* If no error, the agent ID in the CSR is not valid */
955 if (val32 & CSR_AGENT_MASK) {
956 tbl->chip_ops->dump_error_regs(tbl);
957
958 /* reset error */
959 writel(0, target);
960
961 /* Disable bus that caused the error */
962 target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
963 PHB_CONFIG_RW_OFFSET);
964 val32 = be32_to_cpu(readl(target));
965 val32 |= PHB_SLOT_DISABLE;
966 writel(cpu_to_be32(val32), target);
967 readl(target); /* flush */
968 } else {
969 /* Reset the timer */
970 mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
971 }
972}
973
974static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
975 unsigned char busnum, unsigned long timeout)
976{
977 u64 val64;
978 void __iomem *target;
979 unsigned int phb_shift = ~0; /* silence gcc */
980 u64 mask;
981
982 switch (busno_to_phbid(busnum)) {
983 case 0: phb_shift = (63 - 19);
984 break;
985 case 1: phb_shift = (63 - 23);
986 break;
987 case 2: phb_shift = (63 - 27);
988 break;
989 case 3: phb_shift = (63 - 35);
990 break;
991 default:
992 BUG_ON(busno_to_phbid(busnum));
993 }
994
995 target = calgary_reg(bbar, CALGARY_CONFIG_REG);
996 val64 = be64_to_cpu(readq(target));
997
998 /* zero out this PHB's timer bits */
999 mask = ~(0xFUL << phb_shift);
1000 val64 &= mask;
1001 val64 |= (timeout << phb_shift);
1002 writeq(cpu_to_be64(val64), target);
1003 readq(target); /* flush */
1004}
1005
1006static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1007{
1008 unsigned char busnum = dev->bus->number;
1009 void __iomem *bbar = tbl->bbar;
1010 void __iomem *target;
1011 u32 val;
1012
1013 /*
1014 * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
1015 */
1016 target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
1017 val = cpu_to_be32(readl(target));
1018 val |= 0x00800000;
1019 writel(cpu_to_be32(val), target);
1020}
1021
1022static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1023{
1024 unsigned char busnum = dev->bus->number;
1025
1026 /*
1027 * Give split completion a longer timeout on bus 1 for aic94xx
1028 * http://bugzilla.kernel.org/show_bug.cgi?id=7180
1029 */
1030 if (is_calgary(dev->device) && (busnum == 1))
1031 calgary_set_split_completion_timeout(tbl->bbar, busnum,
1032 CCR_2SEC_TIMEOUT);
1033}
1034
1035static void __init calgary_enable_translation(struct pci_dev *dev)
1036{
1037 u32 val32;
1038 unsigned char busnum;
1039 void __iomem *target;
1040 void __iomem *bbar;
1041 struct iommu_table *tbl;
1042
1043 busnum = dev->bus->number;
1044 tbl = pci_iommu(dev->bus);
1045 bbar = tbl->bbar;
1046
1047 /* enable TCE in PHB Config Register */
1048 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1049 val32 = be32_to_cpu(readl(target));
1050 val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
1051
1052 printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
1053 (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
1054 "Calgary" : "CalIOC2", busnum);
1055 printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
1056 "bus.\n");
1057
1058 writel(cpu_to_be32(val32), target);
1059 readl(target); /* flush */
1060
1061 init_timer(&tbl->watchdog_timer);
1062 tbl->watchdog_timer.function = &calgary_watchdog;
1063 tbl->watchdog_timer.data = (unsigned long)dev;
1064 mod_timer(&tbl->watchdog_timer, jiffies);
1065}
1066
1067static void __init calgary_disable_translation(struct pci_dev *dev)
1068{
1069 u32 val32;
1070 unsigned char busnum;
1071 void __iomem *target;
1072 void __iomem *bbar;
1073 struct iommu_table *tbl;
1074
1075 busnum = dev->bus->number;
1076 tbl = pci_iommu(dev->bus);
1077 bbar = tbl->bbar;
1078
1079 /* disable TCE in PHB Config Register */
1080 target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
1081 val32 = be32_to_cpu(readl(target));
1082 val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
1083
1084 printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
1085 writel(cpu_to_be32(val32), target);
1086 readl(target); /* flush */
1087
1088 del_timer_sync(&tbl->watchdog_timer);
1089}
1090
1091static void __init calgary_init_one_nontranslated(struct pci_dev *dev)
1092{
1093 pci_dev_get(dev);
1094 set_pci_iommu(dev->bus, NULL);
1095
1096 /* is the device behind a bridge? */
1097 if (dev->bus->parent)
1098 dev->bus->parent->self = dev;
1099 else
1100 dev->bus->self = dev;
1101}
1102
1103static int __init calgary_init_one(struct pci_dev *dev)
1104{
1105 void __iomem *bbar;
1106 struct iommu_table *tbl;
1107 int ret;
1108
1109 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
1110
1111 bbar = busno_to_bbar(dev->bus->number);
1112 ret = calgary_setup_tar(dev, bbar);
1113 if (ret)
1114 goto done;
1115
1116 pci_dev_get(dev);
1117
1118 if (dev->bus->parent) {
1119 if (dev->bus->parent->self)
1120 printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
1121 "bus->parent->self!\n", dev);
1122 dev->bus->parent->self = dev;
1123 } else
1124 dev->bus->self = dev;
1125
1126 tbl = pci_iommu(dev->bus);
1127 tbl->chip_ops->handle_quirks(tbl, dev);
1128
1129 calgary_enable_translation(dev);
1130
1131 return 0;
1132
1133done:
1134 return ret;
1135}
1136
1137static int __init calgary_locate_bbars(void)
1138{
1139 int ret;
1140 int rioidx, phb, bus;
1141 void __iomem *bbar;
1142 void __iomem *target;
1143 unsigned long offset;
1144 u8 start_bus, end_bus;
1145 u32 val;
1146
1147 ret = -ENODATA;
1148 for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
1149 struct rio_detail *rio = rio_devs[rioidx];
1150
1151 if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
1152 continue;
1153
1154 /* map entire 1MB of Calgary config space */
1155 bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
1156 if (!bbar)
1157 goto error;
1158
1159 for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
1160 offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
1161 target = calgary_reg(bbar, offset);
1162
1163 val = be32_to_cpu(readl(target));
1164
1165 start_bus = (u8)((val & 0x00FF0000) >> 16);
1166 end_bus = (u8)((val & 0x0000FF00) >> 8);
1167
1168 if (end_bus) {
1169 for (bus = start_bus; bus <= end_bus; bus++) {
1170 bus_info[bus].bbar = bbar;
1171 bus_info[bus].phbid = phb;
1172 }
1173 } else {
1174 bus_info[start_bus].bbar = bbar;
1175 bus_info[start_bus].phbid = phb;
1176 }
1177 }
1178 }
1179
1180 return 0;
1181
1182error:
1183 /* scan bus_info and iounmap any bbars we previously ioremap'd */
1184 for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
1185 if (bus_info[bus].bbar)
1186 iounmap(bus_info[bus].bbar);
1187
1188 return ret;
1189}
1190
1191static int __init calgary_init(void)
1192{
1193 int ret;
1194 struct pci_dev *dev = NULL;
1195 void *tce_space;
1196
1197 ret = calgary_locate_bbars();
1198 if (ret)
1199 return ret;
1200
1201 do {
1202 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1203 if (!dev)
1204 break;
1205 if (!is_cal_pci_dev(dev->device))
1206 continue;
1207 if (!translate_phb(dev)) {
1208			calgary_init_one_nontranslated(dev);
1209 continue;
1210 }
1211 tce_space = bus_info[dev->bus->number].tce_space;
1212 if (!tce_space && !translate_empty_slots)
1213 continue;
1214
1215 ret = calgary_init_one(dev);
1216 if (ret)
1217 goto error;
1218 } while (1);
1219
1220 return ret;
1221
1222error:
1223 do {
1224 dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
1225 PCI_ANY_ID, dev);
1226 if (!dev)
1227 break;
1228 if (!is_cal_pci_dev(dev->device))
1229 continue;
1230 if (!translate_phb(dev)) {
1231 pci_dev_put(dev);
1232 continue;
1233 }
1234 if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
1235 continue;
1236
1237 calgary_disable_translation(dev);
1238 calgary_free_bus(dev);
1239 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1240 } while (1);
1241
1242 return ret;
1243}
1244
1245static inline int __init determine_tce_table_size(u64 ram)
1246{
1247 int ret;
1248
1249 if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
1250 return specified_table_size;
1251
1252 /*
1253 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
1254 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
1255 * larger table size has twice as many entries, so shift the
1256 * max ram address by 13 to divide by 8K and then look at the
1257 * order of the result to choose between 0-7.
1258 */
1259 ret = get_order(ram >> 13);
1260 if (ret > TCE_TABLE_SIZE_8M)
1261 ret = TCE_TABLE_SIZE_8M;
1262
1263 return ret;
1264}
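/*
 * Worked example of the heuristic above: with 4 GB of RAM,
 * ram >> 13 is 512 KB and get_order(512 KB) is 7 (for 4 KB pages),
 * so TCE_TABLE_SIZE_8M is chosen; a 64 MB machine gets
 * get_order(8 KB) == 1, i.e. the 128 KB table.
 */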
1265
1266static int __init build_detail_arrays(void)
1267{
1268 unsigned long ptr;
1269 int i, scal_detail_size, rio_detail_size;
1270
1271 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
1272 printk(KERN_WARNING
1273 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1274 "but system has %d nodes.\n",
1275 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
1276 return -ENODEV;
1277 }
1278
1279 switch (rio_table_hdr->version){
1280 case 2:
1281 scal_detail_size = 11;
1282 rio_detail_size = 13;
1283 break;
1284 case 3:
1285 scal_detail_size = 12;
1286 rio_detail_size = 15;
1287 break;
1288 default:
1289 printk(KERN_WARNING
1290 "Calgary: Invalid Rio Grande Table Version: %d\n",
1291 rio_table_hdr->version);
1292 return -EPROTO;
1293 }
1294
1295 ptr = ((unsigned long)rio_table_hdr) + 3;
1296 for (i = 0; i < rio_table_hdr->num_scal_dev;
1297 i++, ptr += scal_detail_size)
1298 scal_devs[i] = (struct scal_detail *)ptr;
1299
1300 for (i = 0; i < rio_table_hdr->num_rio_dev;
1301 i++, ptr += rio_detail_size)
1302 rio_devs[i] = (struct rio_detail *)ptr;
1303
1304 return 0;
1305}
1306
1307static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1308{
1309 int dev;
1310 u32 val;
1311
1312 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1313 /*
1314		 * FIXME: properly scan for devices across the
1315 * PCI-to-PCI bridge on every CalIOC2 port.
1316 */
1317 return 1;
1318 }
1319
1320 for (dev = 1; dev < 8; dev++) {
1321 val = read_pci_config(bus, dev, 0, 0);
1322 if (val != 0xffffffff)
1323 break;
1324 }
1325 return (val != 0xffffffff);
1326}
1327
1328void __init detect_calgary(void)
1329{
1330 int bus;
1331 void *tbl;
1332 int calgary_found = 0;
1333 unsigned long ptr;
1334 unsigned int offset, prev_offset;
1335 int ret;
1336
1337 /*
1338 * if the user specified iommu=off or iommu=soft or we found
1339 * another HW IOMMU already, bail out.
1340 */
1341 if (swiotlb || no_iommu || iommu_detected)
1342 return;
1343
1344 if (!use_calgary)
1345 return;
1346
1347 if (!early_pci_allowed())
1348 return;
1349
1350 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1351
1352 ptr = (unsigned long)phys_to_virt(get_bios_ebda());
1353
1354 rio_table_hdr = NULL;
1355 prev_offset = 0;
1356 offset = 0x180;
1357 /*
1358 * The next offset is stored in the 1st word.
1359	 * Keep parsing only while the offset keeps increasing:
1360 */
1361 while (offset > prev_offset) {
1362 /* The block id is stored in the 2nd word */
1363 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
1364 /* set the pointer past the offset & block id */
1365 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
1366 break;
1367 }
1368 prev_offset = offset;
1369 offset = *((unsigned short *)(ptr + offset));
1370 }
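	/*
	 * In the scan above, each EBDA record begins with a 16-bit offset
	 * to the next record followed by a 16-bit block id; 0x4752
	 * presumably encodes the bytes 'R','G' (Rio Grande) when read as
	 * a little-endian word.
	 */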
1371 if (!rio_table_hdr) {
1372 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1373 "in EBDA - bailing!\n");
1374 return;
1375 }
1376
1377 ret = build_detail_arrays();
1378 if (ret) {
1379 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1380 return;
1381 }
1382
1383 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
1384
1385 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1386 struct calgary_bus_info *info = &bus_info[bus];
1387 unsigned short pci_device;
1388 u32 val;
1389
1390 val = read_pci_config(bus, 0, 0, 0);
1391 pci_device = (val & 0xFFFF0000) >> 16;
1392
1393 if (!is_cal_pci_dev(pci_device))
1394 continue;
1395
1396 if (info->translation_disabled)
1397 continue;
1398
1399 if (calgary_bus_has_devices(bus, pci_device) ||
1400 translate_empty_slots) {
1401 tbl = alloc_tce_table();
1402 if (!tbl)
1403 goto cleanup;
1404 info->tce_space = tbl;
1405 calgary_found = 1;
1406 }
1407 }
1408
1409 printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
1410 calgary_found ? "found" : "not found");
1411
1412 if (calgary_found) {
1413 iommu_detected = 1;
1414 calgary_detected = 1;
1415 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1416 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1417 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1418 debugging ? "enabled" : "disabled");
1419 }
1420 return;
1421
1422cleanup:
1423 for (--bus; bus >= 0; --bus) {
1424 struct calgary_bus_info *info = &bus_info[bus];
1425
1426 if (info->tce_space)
1427 free_tce_table(info->tce_space);
1428 }
1429}
1430
1431int __init calgary_iommu_init(void)
1432{
1433 int ret;
1434
1435 if (no_iommu || swiotlb)
1436 return -ENODEV;
1437
1438 if (!calgary_detected)
1439 return -ENODEV;
1440
1441 /* ok, we're trying to use Calgary - let's roll */
1442 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1443
1444 ret = calgary_init();
1445 if (ret) {
1446 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1447 "falling back to no_iommu\n", ret);
1448 if (end_pfn > MAX_DMA32_PFN)
1449 printk(KERN_ERR "WARNING more than 4GB of memory, "
1450 "32bit PCI may malfunction.\n");
1451 return ret;
1452 }
1453
1454 force_iommu = 1;
1455 bad_dma_address = 0x0;
1456 dma_ops = &calgary_dma_ops;
1457
1458 return 0;
1459}
1460
1461static int __init calgary_parse_options(char *p)
1462{
1463 unsigned int bridge;
1464 size_t len;
1465 char* endp;
1466
1467 while (*p) {
1468 if (!strncmp(p, "64k", 3))
1469 specified_table_size = TCE_TABLE_SIZE_64K;
1470 else if (!strncmp(p, "128k", 4))
1471 specified_table_size = TCE_TABLE_SIZE_128K;
1472 else if (!strncmp(p, "256k", 4))
1473 specified_table_size = TCE_TABLE_SIZE_256K;
1474 else if (!strncmp(p, "512k", 4))
1475 specified_table_size = TCE_TABLE_SIZE_512K;
1476 else if (!strncmp(p, "1M", 2))
1477 specified_table_size = TCE_TABLE_SIZE_1M;
1478 else if (!strncmp(p, "2M", 2))
1479 specified_table_size = TCE_TABLE_SIZE_2M;
1480 else if (!strncmp(p, "4M", 2))
1481 specified_table_size = TCE_TABLE_SIZE_4M;
1482 else if (!strncmp(p, "8M", 2))
1483 specified_table_size = TCE_TABLE_SIZE_8M;
1484
1485 len = strlen("translate_empty_slots");
1486 if (!strncmp(p, "translate_empty_slots", len))
1487 translate_empty_slots = 1;
1488
1489 len = strlen("disable");
1490 if (!strncmp(p, "disable", len)) {
1491 p += len;
1492 if (*p == '=')
1493 ++p;
1494 if (*p == '\0')
1495 break;
1496 bridge = simple_strtol(p, &endp, 0);
1497 if (p == endp)
1498 break;
1499
1500 if (bridge < MAX_PHB_BUS_NUM) {
1501 printk(KERN_INFO "Calgary: disabling "
1502 "translation for PHB %#x\n", bridge);
1503 bus_info[bridge].translation_disabled = 1;
1504 }
1505 }
1506
1507 p = strpbrk(p, ",");
1508 if (!p)
1509 break;
1510
1511 p++; /* skip ',' */
1512 }
1513 return 1;
1514}
1515__setup("calgary=", calgary_parse_options);
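/*
 * Example of a boot parameter accepted by the parser above:
 *	calgary=1M,translate_empty_slots,disable=3
 * which selects a 1 MB TCE table, enables translation for empty
 * slots and disables translation on PHB 3.
 */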
1516
1517static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
1518{
1519 struct iommu_table *tbl;
1520 unsigned int npages;
1521 int i;
1522
1523 tbl = pci_iommu(dev->bus);
1524
1525 for (i = 0; i < 4; i++) {
1526 struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
1527
1528 /* Don't give out TCEs that map MEM resources */
1529 if (!(r->flags & IORESOURCE_MEM))
1530 continue;
1531
1532 /* 0-based? we reserve the whole 1st MB anyway */
1533 if (!r->start)
1534 continue;
1535
1536 /* cover the whole region */
1537 npages = (r->end - r->start) >> PAGE_SHIFT;
1538 npages++;
1539
1540 iommu_range_reserve(tbl, r->start, npages);
1541 }
1542}
1543
1544static int __init calgary_fixup_tce_spaces(void)
1545{
1546 struct pci_dev *dev = NULL;
1547 void *tce_space;
1548
1549 if (no_iommu || swiotlb || !calgary_detected)
1550 return -ENODEV;
1551
1552 printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
1553
1554 do {
1555 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1556 if (!dev)
1557 break;
1558 if (!is_cal_pci_dev(dev->device))
1559 continue;
1560 if (!translate_phb(dev))
1561 continue;
1562
1563 tce_space = bus_info[dev->bus->number].tce_space;
1564 if (!tce_space)
1565 continue;
1566
1567 calgary_fixup_one_tce_space(dev);
1568
1569 } while (1);
1570
1571 return 0;
1572}
1573
1574/*
1575 * We need to be called after pcibios_assign_resources (fs_initcall level)
1576 * and before device_initcall.
1577 */
1578rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
new file mode 100644
index 000000000000..048f09b62553
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_32.c
@@ -0,0 +1,177 @@
1/*
2 * Dynamic DMA mapping support.
3 *
4 * On i386 there is no hardware dynamic DMA address translation,
5 * so consistent alloc/free are merely page allocation/freeing.
6 * The rest of the dynamic DMA mapping interface is implemented
7 * in asm/pci.h.
8 */
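/*
 * A minimal usage sketch of the interface implemented below (the
 * device pointer, size and flags are illustrative only):
 *
 *	void *cpu_addr;
 *	dma_addr_t bus_addr;
 *
 *	cpu_addr = dma_alloc_coherent(&pdev->dev, 4096, &bus_addr, GFP_KERNEL);
 *	...
 *	dma_free_coherent(&pdev->dev, 4096, cpu_addr, bus_addr);
 */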
9
10#include <linux/types.h>
11#include <linux/mm.h>
12#include <linux/string.h>
13#include <linux/pci.h>
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <asm/io.h>
17
18struct dma_coherent_mem {
19 void *virt_base;
20 u32 device_base;
21 int size;
22 int flags;
23 unsigned long *bitmap;
24};
25
26void *dma_alloc_coherent(struct device *dev, size_t size,
27 dma_addr_t *dma_handle, gfp_t gfp)
28{
29 void *ret;
30 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
31 int order = get_order(size);
32 /* ignore region specifiers */
33 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
34
35 if (mem) {
36 int page = bitmap_find_free_region(mem->bitmap, mem->size,
37 order);
38 if (page >= 0) {
39 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
40 ret = mem->virt_base + (page << PAGE_SHIFT);
41 memset(ret, 0, size);
42 return ret;
43 }
44 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
45 return NULL;
46 }
47
48 if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
49 gfp |= GFP_DMA;
50
51 ret = (void *)__get_free_pages(gfp, order);
52
53 if (ret != NULL) {
54 memset(ret, 0, size);
55 *dma_handle = virt_to_phys(ret);
56 }
57 return ret;
58}
59EXPORT_SYMBOL(dma_alloc_coherent);
60
61void dma_free_coherent(struct device *dev, size_t size,
62 void *vaddr, dma_addr_t dma_handle)
63{
64 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
65 int order = get_order(size);
66
67 if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
68 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
69
70 bitmap_release_region(mem->bitmap, page, order);
71 } else
72 free_pages((unsigned long)vaddr, order);
73}
74EXPORT_SYMBOL(dma_free_coherent);
75
76int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
77 dma_addr_t device_addr, size_t size, int flags)
78{
79 void __iomem *mem_base = NULL;
80 int pages = size >> PAGE_SHIFT;
81 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
82
83 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
84 goto out;
85 if (!size)
86 goto out;
87 if (dev->dma_mem)
88 goto out;
89
90 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
91
92 mem_base = ioremap(bus_addr, size);
93 if (!mem_base)
94 goto out;
95
96 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
97 if (!dev->dma_mem)
98 goto out;
99 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap)
101 goto free1_out;
102
103 dev->dma_mem->virt_base = mem_base;
104 dev->dma_mem->device_base = device_addr;
105 dev->dma_mem->size = pages;
106 dev->dma_mem->flags = flags;
107
108 if (flags & DMA_MEMORY_MAP)
109 return DMA_MEMORY_MAP;
110
111 return DMA_MEMORY_IO;
112
113 free1_out:
114 kfree(dev->dma_mem);
115 out:
116 if (mem_base)
117 iounmap(mem_base);
118 return 0;
119}
120EXPORT_SYMBOL(dma_declare_coherent_memory);
121
122void dma_release_declared_memory(struct device *dev)
123{
124 struct dma_coherent_mem *mem = dev->dma_mem;
125
126 if(!mem)
127 return;
128 dev->dma_mem = NULL;
129 iounmap(mem->virt_base);
130 kfree(mem->bitmap);
131 kfree(mem);
132}
133EXPORT_SYMBOL(dma_release_declared_memory);
134
135void *dma_mark_declared_memory_occupied(struct device *dev,
136 dma_addr_t device_addr, size_t size)
137{
138 struct dma_coherent_mem *mem = dev->dma_mem;
139 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
140 int pos, err;
141
142 if (!mem)
143 return ERR_PTR(-EINVAL);
144
145 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
146 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
147 if (err != 0)
148 return ERR_PTR(err);
149 return mem->virt_base + (pos << PAGE_SHIFT);
150}
151EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
152
153#ifdef CONFIG_PCI
154/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
155
156int forbid_dac;
157EXPORT_SYMBOL(forbid_dac);
158
159static __devinit void via_no_dac(struct pci_dev *dev)
160{
161 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
162 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
163 forbid_dac = 1;
164 }
165}
166DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
167
168static int check_iommu(char *s)
169{
170 if (!strcmp(s, "usedac")) {
171 forbid_dac = -1;
172 return 1;
173 }
174 return 0;
175}
176__setup("iommu=", check_iommu);
177#endif
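The interface implemented above is what drivers call for coherent (consistent) buffers. A minimal driver-side sketch, assuming a hypothetical PCI device and a one-page descriptor ring (not code from this patch):

/* Hypothetical use of dma_alloc_coherent()/dma_free_coherent() from a driver. */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int example_setup_ring(struct pci_dev *pdev)
{
	dma_addr_t ring_dma;
	void *ring;

	/* One page the CPU and the device can share without explicit syncing. */
	ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &ring_dma, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;

	/* ... program ring_dma into the device and use 'ring' from the CPU ... */

	dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);
	return 0;
}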
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
new file mode 100644
index 000000000000..29711445c818
--- /dev/null
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -0,0 +1,346 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11#include <asm/iommu.h>
12#include <asm/calgary.h>
13
14int iommu_merge __read_mostly = 0;
15EXPORT_SYMBOL(iommu_merge);
16
17dma_addr_t bad_dma_address __read_mostly;
18EXPORT_SYMBOL(bad_dma_address);
19
20/* This tells the BIO block layer to assume merging. Default to off
21 because we cannot guarantee merging later. */
22int iommu_bio_merge __read_mostly = 0;
23EXPORT_SYMBOL(iommu_bio_merge);
24
25static int iommu_sac_force __read_mostly = 0;
26
27int no_iommu __read_mostly;
28#ifdef CONFIG_IOMMU_DEBUG
29int panic_on_overflow __read_mostly = 1;
30int force_iommu __read_mostly = 1;
31#else
32int panic_on_overflow __read_mostly = 0;
33int force_iommu __read_mostly= 0;
34#endif
35
36/* Set this to 1 if there is a HW IOMMU in the system */
37int iommu_detected __read_mostly = 0;
38
 39/* Dummy device used for NULL arguments (normally ISA). A smaller DMA
 40 mask would probably be better, but this is bug-to-bug compatible
 41 with i386. */
42struct device fallback_dev = {
43 .bus_id = "fallback device",
44 .coherent_dma_mask = DMA_32BIT_MASK,
45 .dma_mask = &fallback_dev.coherent_dma_mask,
46};
47
48/* Allocate DMA memory on node near device */
49noinline static void *
50dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
51{
52 struct page *page;
53 int node;
54#ifdef CONFIG_PCI
55 if (dev->bus == &pci_bus_type)
56 node = pcibus_to_node(to_pci_dev(dev)->bus);
57 else
58#endif
59 node = numa_node_id();
60
61 if (node < first_node(node_online_map))
62 node = first_node(node_online_map);
63
64 page = alloc_pages_node(node, gfp, order);
65 return page ? page_address(page) : NULL;
66}
67
68/*
69 * Allocate memory for a coherent mapping.
70 */
71void *
72dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
73 gfp_t gfp)
74{
75 void *memory;
76 unsigned long dma_mask = 0;
77 u64 bus;
78
79 if (!dev)
80 dev = &fallback_dev;
81 dma_mask = dev->coherent_dma_mask;
82 if (dma_mask == 0)
83 dma_mask = DMA_32BIT_MASK;
84
85 /* Device not DMA able */
86 if (dev->dma_mask == NULL)
87 return NULL;
88
89 /* Don't invoke OOM killer */
90 gfp |= __GFP_NORETRY;
91
92 /* Kludge to make it bug-to-bug compatible with i386. i386
93 uses the normal dma_mask for alloc_coherent. */
94 dma_mask &= *dev->dma_mask;
95
96 /* Why <=? Even when the mask is smaller than 4GB it is often
97 larger than 16MB and in this case we have a chance of
98 finding fitting memory in the next higher zone first. If
99 not retry with true GFP_DMA. -AK */
100 if (dma_mask <= DMA_32BIT_MASK)
101 gfp |= GFP_DMA32;
102
103 again:
104 memory = dma_alloc_pages(dev, gfp, get_order(size));
105 if (memory == NULL)
106 return NULL;
107
108 {
109 int high, mmu;
110 bus = virt_to_bus(memory);
111 high = (bus + size) >= dma_mask;
112 mmu = high;
113 if (force_iommu && !(gfp & GFP_DMA))
114 mmu = 1;
115 else if (high) {
116 free_pages((unsigned long)memory,
117 get_order(size));
118
119 /* Don't use the 16MB ZONE_DMA unless absolutely
120 needed. It's better to use remapping first. */
121 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
122 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
123 goto again;
124 }
125
126 /* Let low level make its own zone decisions */
127 gfp &= ~(GFP_DMA32|GFP_DMA);
128
129 if (dma_ops->alloc_coherent)
130 return dma_ops->alloc_coherent(dev, size,
131 dma_handle, gfp);
132 return NULL;
133 }
134
135 memset(memory, 0, size);
136 if (!mmu) {
137 *dma_handle = virt_to_bus(memory);
138 return memory;
139 }
140 }
141
142 if (dma_ops->alloc_coherent) {
143 free_pages((unsigned long)memory, get_order(size));
144 gfp &= ~(GFP_DMA|GFP_DMA32);
145 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
146 }
147
148 if (dma_ops->map_simple) {
149 *dma_handle = dma_ops->map_simple(dev, memory,
150 size,
151 PCI_DMA_BIDIRECTIONAL);
152 if (*dma_handle != bad_dma_address)
153 return memory;
154 }
155
156 if (panic_on_overflow)
157 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
158 free_pages((unsigned long)memory, get_order(size));
159 return NULL;
160}
161EXPORT_SYMBOL(dma_alloc_coherent);
162
163/*
164 * Unmap coherent memory.
165 * The caller must ensure that the device has finished accessing the mapping.
166 */
167void dma_free_coherent(struct device *dev, size_t size,
168 void *vaddr, dma_addr_t bus)
169{
170 if (dma_ops->unmap_single)
171 dma_ops->unmap_single(dev, bus, size, 0);
172 free_pages((unsigned long)vaddr, get_order(size));
173}
174EXPORT_SYMBOL(dma_free_coherent);
175
176static int forbid_dac __read_mostly;
177
178int dma_supported(struct device *dev, u64 mask)
179{
180#ifdef CONFIG_PCI
181 if (mask > 0xffffffff && forbid_dac > 0) {
182
183
184
185 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
186 return 0;
187 }
188#endif
189
190 if (dma_ops->dma_supported)
191 return dma_ops->dma_supported(dev, mask);
192
193 /* Copied from i386. Doesn't make much sense, because it will
194 only work for pci_alloc_coherent.
195 The caller just has to use GFP_DMA in this case. */
196 if (mask < DMA_24BIT_MASK)
197 return 0;
198
199 /* Tell the device to use SAC when IOMMU force is on. This
200 allows the driver to use cheaper accesses in some cases.
201
202 Problem with this is that if we overflow the IOMMU area and
203 return DAC as fallback address the device may not handle it
204 correctly.
205
206 As a special case some controllers have a 39bit address
207 mode that is as efficient as 32bit (aic79xx). Don't force
208 SAC for these. Assume all masks <= 40 bits are of this
209 type. Normally this doesn't make any difference, but gives
210 more gentle handling of IOMMU overflow. */
211 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
212 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
213 return 0;
214 }
215
216 return 1;
217}
218EXPORT_SYMBOL(dma_supported);
219
220int dma_set_mask(struct device *dev, u64 mask)
221{
222 if (!dev->dma_mask || !dma_supported(dev, mask))
223 return -EIO;
224 *dev->dma_mask = mask;
225 return 0;
226}
227EXPORT_SYMBOL(dma_set_mask);
228
229/*
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation.
232 */
233__init int iommu_setup(char *p)
234{
235 iommu_merge = 1;
236
237 if (!p)
238 return -EINVAL;
239
240 while (*p) {
241 if (!strncmp(p,"off",3))
242 no_iommu = 1;
243 /* gart_parse_options has more force support */
244 if (!strncmp(p,"force",5))
245 force_iommu = 1;
246 if (!strncmp(p,"noforce",7)) {
247 iommu_merge = 0;
248 force_iommu = 0;
249 }
250
251 if (!strncmp(p, "biomerge",8)) {
252 iommu_bio_merge = 4096;
253 iommu_merge = 1;
254 force_iommu = 1;
255 }
256 if (!strncmp(p, "panic",5))
257 panic_on_overflow = 1;
258 if (!strncmp(p, "nopanic",7))
259 panic_on_overflow = 0;
260 if (!strncmp(p, "merge",5)) {
261 iommu_merge = 1;
262 force_iommu = 1;
263 }
264 if (!strncmp(p, "nomerge",7))
265 iommu_merge = 0;
266 if (!strncmp(p, "forcesac",8))
267 iommu_sac_force = 1;
268 if (!strncmp(p, "allowdac", 8))
269 forbid_dac = 0;
270 if (!strncmp(p, "nodac", 5))
271 forbid_dac = -1;
272
273#ifdef CONFIG_SWIOTLB
274 if (!strncmp(p, "soft",4))
275 swiotlb = 1;
276#endif
277
278#ifdef CONFIG_IOMMU
279 gart_parse_options(p);
280#endif
281
282#ifdef CONFIG_CALGARY_IOMMU
283 if (!strncmp(p, "calgary", 7))
284 use_calgary = 1;
285#endif /* CONFIG_CALGARY_IOMMU */
286
287 p += strcspn(p, ",");
288 if (*p == ',')
289 ++p;
290 }
291 return 0;
292}
293early_param("iommu", iommu_setup);
294
295void __init pci_iommu_alloc(void)
296{
297 /*
298 * The order of these functions is important for
299 * fall-back/fail-over reasons
300 */
301#ifdef CONFIG_IOMMU
302 iommu_hole_init();
303#endif
304
305#ifdef CONFIG_CALGARY_IOMMU
306 detect_calgary();
307#endif
308
309#ifdef CONFIG_SWIOTLB
310 pci_swiotlb_init();
311#endif
312}
313
314static int __init pci_iommu_init(void)
315{
316#ifdef CONFIG_CALGARY_IOMMU
317 calgary_iommu_init();
318#endif
319
320#ifdef CONFIG_IOMMU
321 gart_iommu_init();
322#endif
323
324 no_iommu_init();
325 return 0;
326}
327
328void pci_iommu_shutdown(void)
329{
330 gart_iommu_shutdown();
331}
332
333#ifdef CONFIG_PCI
334/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
335
336static __devinit void via_no_dac(struct pci_dev *dev)
337{
338 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
339 printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
340 forbid_dac = 1;
341 }
342}
343DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
344#endif
345/* Must execute after PCI subsystem */
346fs_initcall(pci_iommu_init);
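dma_supported() and dma_set_mask() above back the usual mask negotiation done in driver probe routines. A hedged sketch of that pattern using the PCI wrappers of this era (hypothetical device; try 64-bit DAC first, then fall back to 32-bit SAC):

/* Hypothetical DMA mask negotiation as seen from a PCI driver. */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int example_set_masks(struct pci_dev *pdev)
{
	/* Prefer 64-bit (DAC) addressing when dma_supported() allows it... */
	if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK))
		return 0;

	/* ...otherwise fall back to 32-bit (SAC) addressing. */
	if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK) &&
	    !pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK))
		return 0;

	return -EIO;
}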
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
new file mode 100644
index 000000000000..4918c575d582
--- /dev/null
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -0,0 +1,740 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
 5 * This allows PCI devices that only support 32bit addresses to be used on systems
6 * with more than 4GB.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/agp_backend.h>
16#include <linux/init.h>
17#include <linux/mm.h>
18#include <linux/string.h>
19#include <linux/spinlock.h>
20#include <linux/pci.h>
21#include <linux/module.h>
22#include <linux/topology.h>
23#include <linux/interrupt.h>
24#include <linux/bitops.h>
25#include <linux/kdebug.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/iommu.h>
32#include <asm/cacheflush.h>
33#include <asm/swiotlb.h>
34#include <asm/dma.h>
35#include <asm/k8.h>
36
37unsigned long iommu_bus_base; /* GART remapping area (physical) */
38static unsigned long iommu_size; /* size of remapping area bytes */
39static unsigned long iommu_pages; /* .. and in pages */
40
41u32 *iommu_gatt_base; /* Remapping table */
42
 43/* If this is disabled the IOMMU will use an optimized flushing strategy
 44 of only flushing when a mapping is reused. With it true the GART is flushed
 45 for every mapping. Problem is that doing the lazy flush seems to trigger
 46 bugs with some popular PCI cards, in particular 3ware (but it has also
 47 been seen with Qlogic at least). */
48int iommu_fullflush = 1;
49
50/* Allocation bitmap for the remapping area */
51static DEFINE_SPINLOCK(iommu_bitmap_lock);
52static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
53
54static u32 gart_unmapped_entry;
55
56#define GPTE_VALID 1
57#define GPTE_COHERENT 2
58#define GPTE_ENCODE(x) \
59 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
60#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
61
62#define to_pages(addr,size) \
63 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
64
65#define EMERGENCY_PAGES 32 /* = 128KB */
66
67#ifdef CONFIG_AGP
68#define AGPEXTERN extern
69#else
70#define AGPEXTERN
71#endif
72
73/* backdoor interface to AGP driver */
74AGPEXTERN int agp_memory_reserved;
75AGPEXTERN __u32 *agp_gatt_table;
76
77static unsigned long next_bit; /* protected by iommu_bitmap_lock */
78static int need_flush; /* global flush state. set for each gart wrap */
79
80static unsigned long alloc_iommu(int size)
81{
82 unsigned long offset, flags;
83
84 spin_lock_irqsave(&iommu_bitmap_lock, flags);
85 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
86 if (offset == -1) {
87 need_flush = 1;
88 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
89 }
90 if (offset != -1) {
91 set_bit_string(iommu_gart_bitmap, offset, size);
92 next_bit = offset+size;
93 if (next_bit >= iommu_pages) {
94 next_bit = 0;
95 need_flush = 1;
96 }
97 }
98 if (iommu_fullflush)
99 need_flush = 1;
100 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
101 return offset;
102}
103
104static void free_iommu(unsigned long offset, int size)
105{
106 unsigned long flags;
107 spin_lock_irqsave(&iommu_bitmap_lock, flags);
108 __clear_bit_string(iommu_gart_bitmap, offset, size);
109 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
110}
111
112/*
113 * Use global flush state to avoid races with multiple flushers.
114 */
115static void flush_gart(void)
116{
117 unsigned long flags;
118 spin_lock_irqsave(&iommu_bitmap_lock, flags);
119 if (need_flush) {
120 k8_flush_garts();
121 need_flush = 0;
122 }
123 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
124}
125
126#ifdef CONFIG_IOMMU_LEAK
127
128#define SET_LEAK(x) if (iommu_leak_tab) \
129 iommu_leak_tab[x] = __builtin_return_address(0);
130#define CLEAR_LEAK(x) if (iommu_leak_tab) \
131 iommu_leak_tab[x] = NULL;
132
133/* Debugging aid for drivers that don't free their IOMMU tables */
134static void **iommu_leak_tab;
135static int leak_trace;
136int iommu_leak_pages = 20;
137void dump_leak(void)
138{
139 int i;
140 static int dump;
141 if (dump || !iommu_leak_tab) return;
142 dump = 1;
143 show_stack(NULL,NULL);
144 /* Very crude. dump some from the end of the table too */
145 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
146 for (i = 0; i < iommu_leak_pages; i+=2) {
147 printk("%lu: ", iommu_pages-i);
148 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
149 printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
150 }
151 printk("\n");
152}
153#else
154#define SET_LEAK(x)
155#define CLEAR_LEAK(x)
156#endif
157
158static void iommu_full(struct device *dev, size_t size, int dir)
159{
160 /*
161 * Ran out of IOMMU space for this operation. This is very bad.
162 * Unfortunately the drivers cannot handle this operation properly.
 163 * Return some non-mapped prereserved space in the aperture and
164 * let the Northbridge deal with it. This will result in garbage
165 * in the IO operation. When the size exceeds the prereserved space
166 * memory corruption will occur or random memory will be DMAed
167 * out. Hopefully no network devices use single mappings that big.
168 */
169
170 printk(KERN_ERR
171 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
172 size, dev->bus_id);
173
174 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
175 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
176 panic("PCI-DMA: Memory would be corrupted\n");
177 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
179 }
180
181#ifdef CONFIG_IOMMU_LEAK
182 dump_leak();
183#endif
184}
185
186static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
187{
188 u64 mask = *dev->dma_mask;
189 int high = addr + size > mask;
190 int mmu = high;
191 if (force_iommu)
192 mmu = 1;
193 return mmu;
194}
195
196static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
197{
198 u64 mask = *dev->dma_mask;
199 int high = addr + size > mask;
200 int mmu = high;
201 return mmu;
202}
203
204/* Map a single continuous physical area into the IOMMU.
205 * Caller needs to check if the iommu is needed and flush.
206 */
207static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
208 size_t size, int dir)
209{
210 unsigned long npages = to_pages(phys_mem, size);
211 unsigned long iommu_page = alloc_iommu(npages);
212 int i;
213 if (iommu_page == -1) {
214 if (!nonforced_iommu(dev, phys_mem, size))
215 return phys_mem;
216 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir);
219 return bad_dma_address;
220 }
221
222 for (i = 0; i < npages; i++) {
223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
224 SET_LEAK(iommu_page + i);
225 phys_mem += PAGE_SIZE;
226 }
227 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
228}
229
230static dma_addr_t gart_map_simple(struct device *dev, char *buf,
231 size_t size, int dir)
232{
233 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
234 flush_gart();
235 return map;
236}
237
238/* Map a single area into the IOMMU */
239static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
240{
241 unsigned long phys_mem, bus;
242
243 if (!dev)
244 dev = &fallback_dev;
245
246 phys_mem = virt_to_phys(addr);
247 if (!need_iommu(dev, phys_mem, size))
248 return phys_mem;
249
250 bus = gart_map_simple(dev, addr, size, dir);
251 return bus;
252}
253
254/*
255 * Free a DMA mapping.
256 */
257static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
258 size_t size, int direction)
259{
260 unsigned long iommu_page;
261 int npages;
262 int i;
263
264 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
265 dma_addr >= iommu_bus_base + iommu_size)
266 return;
267 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
268 npages = to_pages(dma_addr, size);
269 for (i = 0; i < npages; i++) {
270 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
271 CLEAR_LEAK(iommu_page + i);
272 }
273 free_iommu(iommu_page, npages);
274}
275
276/*
277 * Wrapper for pci_unmap_single working with scatterlists.
278 */
279static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
280{
281 int i;
282
283 for (i = 0; i < nents; i++) {
284 struct scatterlist *s = &sg[i];
285 if (!s->dma_length || !s->length)
286 break;
287 gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
288 }
289}
290
291/* Fallback for dma_map_sg in case of overflow */
292static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
293 int nents, int dir)
294{
295 int i;
296
297#ifdef CONFIG_IOMMU_DEBUG
298 printk(KERN_DEBUG "dma_map_sg overflow\n");
299#endif
300
301 for (i = 0; i < nents; i++ ) {
302 struct scatterlist *s = &sg[i];
303 unsigned long addr = page_to_phys(s->page) + s->offset;
304 if (nonforced_iommu(dev, addr, s->length)) {
305 addr = dma_map_area(dev, addr, s->length, dir);
306 if (addr == bad_dma_address) {
307 if (i > 0)
308 gart_unmap_sg(dev, sg, i, dir);
309 nents = 0;
310 sg[0].dma_length = 0;
311 break;
312 }
313 }
314 s->dma_address = addr;
315 s->dma_length = s->length;
316 }
317 flush_gart();
318 return nents;
319}
320
 321/* Map multiple scatterlist entries continuously into the first. */
322static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
323 struct scatterlist *sout, unsigned long pages)
324{
325 unsigned long iommu_start = alloc_iommu(pages);
326 unsigned long iommu_page = iommu_start;
327 int i;
328
329 if (iommu_start == -1)
330 return -1;
331
332 for (i = start; i < stopat; i++) {
333 struct scatterlist *s = &sg[i];
334 unsigned long pages, addr;
335 unsigned long phys_addr = s->dma_address;
336
337 BUG_ON(i > start && s->offset);
338 if (i == start) {
339 *sout = *s;
340 sout->dma_address = iommu_bus_base;
341 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
342 sout->dma_length = s->length;
343 } else {
344 sout->dma_length += s->length;
345 }
346
347 addr = phys_addr;
348 pages = to_pages(s->offset, s->length);
349 while (pages--) {
350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
351 SET_LEAK(iommu_page);
352 addr += PAGE_SIZE;
353 iommu_page++;
354 }
355 }
356 BUG_ON(iommu_page - iommu_start != pages);
357 return 0;
358}
359
360static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
361 struct scatterlist *sout,
362 unsigned long pages, int need)
363{
364 if (!need) {
365 BUG_ON(stopat - start != 1);
366 *sout = sg[start];
367 sout->dma_length = sg[start].length;
368 return 0;
369 }
370 return __dma_map_cont(sg, start, stopat, sout, pages);
371}
372
373/*
374 * DMA map all entries in a scatterlist.
375 * Merge chunks that have page aligned sizes into a continuous mapping.
376 */
377int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
378{
379 int i;
380 int out;
381 int start;
382 unsigned long pages = 0;
383 int need = 0, nextneed;
384
385 if (nents == 0)
386 return 0;
387
388 if (!dev)
389 dev = &fallback_dev;
390
391 out = 0;
392 start = 0;
393 for (i = 0; i < nents; i++) {
394 struct scatterlist *s = &sg[i];
395 dma_addr_t addr = page_to_phys(s->page) + s->offset;
396 s->dma_address = addr;
397 BUG_ON(s->length == 0);
398
399 nextneed = need_iommu(dev, addr, s->length);
400
401 /* Handle the previous not yet processed entries */
402 if (i > start) {
403 struct scatterlist *ps = &sg[i-1];
404 /* Can only merge when the last chunk ends on a page
405 boundary and the new one doesn't have an offset. */
406 if (!iommu_merge || !nextneed || !need || s->offset ||
407 (ps->offset + ps->length) % PAGE_SIZE) {
408 if (dma_map_cont(sg, start, i, sg+out, pages,
409 need) < 0)
410 goto error;
411 out++;
412 pages = 0;
413 start = i;
414 }
415 }
416
417 need = nextneed;
418 pages += to_pages(s->offset, s->length);
419 }
420 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
421 goto error;
422 out++;
423 flush_gart();
424 if (out < nents)
425 sg[out].dma_length = 0;
426 return out;
427
428error:
429 flush_gart();
430 gart_unmap_sg(dev, sg, nents, dir);
431 /* When it was forced or merged try again in a dumb way */
432 if (force_iommu || iommu_merge) {
433 out = dma_map_sg_nonforce(dev, sg, nents, dir);
434 if (out > 0)
435 return out;
436 }
437 if (panic_on_overflow)
438 panic("dma_map_sg: overflow on %lu pages\n", pages);
439 iommu_full(dev, pages << PAGE_SHIFT, dir);
440 for (i = 0; i < nents; i++)
441 sg[i].dma_address = bad_dma_address;
442 return 0;
443}
444
445static int no_agp;
446
447static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
448{
449 unsigned long a;
450 if (!iommu_size) {
451 iommu_size = aper_size;
452 if (!no_agp)
453 iommu_size /= 2;
454 }
455
456 a = aper + iommu_size;
457 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
458
459 if (iommu_size < 64*1024*1024)
460 printk(KERN_WARNING
461 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
462
463 return iommu_size;
464}
465
466static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
467{
468 unsigned aper_size = 0, aper_base_32;
469 u64 aper_base;
470 unsigned aper_order;
471
472 pci_read_config_dword(dev, 0x94, &aper_base_32);
473 pci_read_config_dword(dev, 0x90, &aper_order);
474 aper_order = (aper_order >> 1) & 7;
475
476 aper_base = aper_base_32 & 0x7fff;
477 aper_base <<= 25;
478
479 aper_size = (32 * 1024 * 1024) << aper_order;
480 if (aper_base + aper_size > 0x100000000UL || !aper_size)
481 aper_base = 0;
482
483 *size = aper_size;
484 return aper_base;
485}
486
487/*
488 * Private Northbridge GATT initialization in case we cannot use the
489 * AGP driver for some reason.
490 */
491static __init int init_k8_gatt(struct agp_kern_info *info)
492{
493 struct pci_dev *dev;
494 void *gatt;
495 unsigned aper_base, new_aper_base;
496 unsigned aper_size, gatt_size, new_aper_size;
497 int i;
498
499 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
500 aper_size = aper_base = info->aper_size = 0;
501 dev = NULL;
502 for (i = 0; i < num_k8_northbridges; i++) {
503 dev = k8_northbridges[i];
504 new_aper_base = read_aperture(dev, &new_aper_size);
505 if (!new_aper_base)
506 goto nommu;
507
508 if (!aper_base) {
509 aper_size = new_aper_size;
510 aper_base = new_aper_base;
511 }
512 if (aper_size != new_aper_size || aper_base != new_aper_base)
513 goto nommu;
514 }
515 if (!aper_base)
516 goto nommu;
517 info->aper_base = aper_base;
518 info->aper_size = aper_size>>20;
519
520 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
521 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
522 if (!gatt)
523 panic("Cannot allocate GATT table");
524 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
525 panic("Could not set GART PTEs to uncacheable pages");
526 global_flush_tlb();
527
528 memset(gatt, 0, gatt_size);
529 agp_gatt_table = gatt;
530
531 for (i = 0; i < num_k8_northbridges; i++) {
532 u32 ctl;
533 u32 gatt_reg;
534
535 dev = k8_northbridges[i];
536 gatt_reg = __pa(gatt) >> 12;
537 gatt_reg <<= 4;
538 pci_write_config_dword(dev, 0x98, gatt_reg);
539 pci_read_config_dword(dev, 0x90, &ctl);
540
541 ctl |= 1;
542 ctl &= ~((1<<4) | (1<<5));
543
544 pci_write_config_dword(dev, 0x90, ctl);
545 }
546 flush_gart();
547
548 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
549 return 0;
550
551 nommu:
552 /* Should not happen anymore */
553 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
554 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
555 return -1;
556}
557
558extern int agp_amd64_init(void);
559
560static const struct dma_mapping_ops gart_dma_ops = {
561 .mapping_error = NULL,
562 .map_single = gart_map_single,
563 .map_simple = gart_map_simple,
564 .unmap_single = gart_unmap_single,
565 .sync_single_for_cpu = NULL,
566 .sync_single_for_device = NULL,
567 .sync_single_range_for_cpu = NULL,
568 .sync_single_range_for_device = NULL,
569 .sync_sg_for_cpu = NULL,
570 .sync_sg_for_device = NULL,
571 .map_sg = gart_map_sg,
572 .unmap_sg = gart_unmap_sg,
573};
574
575void gart_iommu_shutdown(void)
576{
577 struct pci_dev *dev;
578 int i;
579
580 if (no_agp && (dma_ops != &gart_dma_ops))
581 return;
582
583 for (i = 0; i < num_k8_northbridges; i++) {
584 u32 ctl;
585
586 dev = k8_northbridges[i];
587 pci_read_config_dword(dev, 0x90, &ctl);
588
589 ctl &= ~1;
590
591 pci_write_config_dword(dev, 0x90, ctl);
592 }
593}
594
595void __init gart_iommu_init(void)
596{
597 struct agp_kern_info info;
598 unsigned long aper_size;
599 unsigned long iommu_start;
600 unsigned long scratch;
601 long i;
602
603 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
604 printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
605 return;
606 }
607
608#ifndef CONFIG_AGP_AMD64
609 no_agp = 1;
610#else
611 /* Makefile puts PCI initialization via subsys_initcall first. */
612 /* Add other K8 AGP bridge drivers here */
613 no_agp = no_agp ||
614 (agp_amd64_init() < 0) ||
615 (agp_copy_info(agp_bridge, &info) < 0);
616#endif
617
618 if (swiotlb)
619 return;
620
621 /* Did we detect a different HW IOMMU? */
622 if (iommu_detected && !iommu_aperture)
623 return;
624
625 if (no_iommu ||
626 (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
627 !iommu_aperture ||
628 (no_agp && init_k8_gatt(&info) < 0)) {
629 if (end_pfn > MAX_DMA32_PFN) {
630 printk(KERN_ERR "WARNING more than 4GB of memory "
631 "but GART IOMMU not available.\n"
632 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
633 }
634 return;
635 }
636
637 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
638 aper_size = info.aper_size * 1024 * 1024;
639 iommu_size = check_iommu_size(info.aper_base, aper_size);
640 iommu_pages = iommu_size >> PAGE_SHIFT;
641
642 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
643 get_order(iommu_pages/8));
644 if (!iommu_gart_bitmap)
645 panic("Cannot allocate iommu bitmap\n");
646 memset(iommu_gart_bitmap, 0, iommu_pages/8);
647
648#ifdef CONFIG_IOMMU_LEAK
649 if (leak_trace) {
650 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages*sizeof(void *)));
652 if (iommu_leak_tab)
653 memset(iommu_leak_tab, 0, iommu_pages * 8);
654 else
655 printk("PCI-DMA: Cannot allocate leak trace area\n");
656 }
657#endif
658
659 /*
660 * Out of IOMMU space handling.
661 * Reserve some invalid pages at the beginning of the GART.
662 */
663 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
664
665 agp_memory_reserved = iommu_size;
666 printk(KERN_INFO
667 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
668 iommu_size>>20);
669
670 iommu_start = aper_size - iommu_size;
671 iommu_bus_base = info.aper_base + iommu_start;
672 bad_dma_address = iommu_bus_base;
673 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
674
675 /*
676 * Unmap the IOMMU part of the GART. The alias of the page is
677 * always mapped with cache enabled and there is no full cache
678 * coherency across the GART remapping. The unmapping avoids
679 * automatic prefetches from the CPU allocating cache lines in
680 * there. All CPU accesses are done via the direct mapping to
681 * the backing memory. The GART address is only used by PCI
682 * devices.
683 */
684 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
685
686 /*
 687 * Try to work around a bug (thanks to BenH):
 688 * set unmapped entries to a scratch page instead of 0.
 689 * Any prefetches that hit unmapped entries won't get a bus abort
 690 * then.
691 */
692 scratch = get_zeroed_page(GFP_KERNEL);
693 if (!scratch)
694 panic("Cannot allocate iommu scratch page");
695 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
696 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
697 iommu_gatt_base[i] = gart_unmapped_entry;
698
699 flush_gart();
700 dma_ops = &gart_dma_ops;
701}
702
703void __init gart_parse_options(char *p)
704{
705 int arg;
706
707#ifdef CONFIG_IOMMU_LEAK
708 if (!strncmp(p,"leak",4)) {
709 leak_trace = 1;
710 p += 4;
711 if (*p == '=') ++p;
712 if (isdigit(*p) && get_option(&p, &arg))
713 iommu_leak_pages = arg;
714 }
715#endif
716 if (isdigit(*p) && get_option(&p, &arg))
717 iommu_size = arg;
718 if (!strncmp(p, "fullflush",8))
719 iommu_fullflush = 1;
720 if (!strncmp(p, "nofullflush",11))
721 iommu_fullflush = 0;
722 if (!strncmp(p,"noagp",5))
723 no_agp = 1;
724 if (!strncmp(p, "noaperture",10))
725 fix_aperture = 0;
726 /* duplicated from pci-dma.c */
727 if (!strncmp(p,"force",5))
728 iommu_aperture_allowed = 1;
729 if (!strncmp(p,"allowed",7))
730 iommu_aperture_allowed = 1;
731 if (!strncmp(p, "memaper", 7)) {
732 fallback_aper_force = 1;
733 p += 7;
734 if (*p == '=') {
735 ++p;
736 if (get_option(&p, &arg))
737 fallback_aper_order = arg;
738 }
739 }
740}
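The GPTE_ENCODE()/GPTE_DECODE() macros near the top of this file pack a 40-bit physical address into a 32-bit GART PTE: bits 31:12 carry phys[31:12], bits 11:4 carry phys[39:32], and bits 1:0 are the coherent and valid flags. A stand-alone worked example with a made-up page-aligned address:

/* Round-trip check of the GART PTE encoding (user-space sketch, not kernel code). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GPTE_VALID	1
#define GPTE_COHERENT	2
#define GPTE_ENCODE(x) \
	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((uint64_t)(x) & 0xff0) << 28))

int main(void)
{
	uint64_t phys = 0x1234567000ULL;	/* 40-bit, page aligned */
	uint32_t pte = GPTE_ENCODE(phys);	/* 0x34567123 */

	printf("phys %#llx -> pte %#x -> phys %#llx\n",
	       (unsigned long long)phys, pte,
	       (unsigned long long)GPTE_DECODE(pte));
	assert(GPTE_DECODE(pte) == phys);
	return 0;
}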
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c
new file mode 100644
index 000000000000..2a34c6c025a9
--- /dev/null
+++ b/arch/x86/kernel/pci-nommu_64.c
@@ -0,0 +1,97 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h>
8
9#include <asm/iommu.h>
10#include <asm/processor.h>
11#include <asm/dma.h>
12
13static int
14check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
15{
16 if (hwdev && bus + size > *hwdev->dma_mask) {
17 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
18 printk(KERN_ERR
19 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
20 name, (long long)bus, size,
21 (long long)*hwdev->dma_mask);
22 return 0;
23 }
24 return 1;
25}
26
27static dma_addr_t
28nommu_map_single(struct device *hwdev, void *ptr, size_t size,
29 int direction)
30{
31 dma_addr_t bus = virt_to_bus(ptr);
32 if (!check_addr("map_single", hwdev, bus, size))
33 return bad_dma_address;
34 return bus;
35}
36
37static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
38 int direction)
39{
40}
41
42/* Map a set of buffers described by scatterlist in streaming
43 * mode for DMA. This is the scatter-gather version of the
44 * above pci_map_single interface. Here the scatter gather list
45 * elements are each tagged with the appropriate dma address
46 * and length. They are obtained via sg_dma_{address,length}(SG).
47 *
48 * NOTE: An implementation may be able to use a smaller number of
49 * DMA address/length pairs than there are SG table elements.
50 * (for example via virtual mapping capabilities)
51 * The routine returns the number of addr/length pairs actually
52 * used, at most nents.
53 *
54 * Device ownership issues as mentioned above for pci_map_single are
55 * the same here.
56 */
57static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
58 int nents, int direction)
59{
60 int i;
61
62 for (i = 0; i < nents; i++ ) {
63 struct scatterlist *s = &sg[i];
64 BUG_ON(!s->page);
65 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
66 if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
67 return 0;
68 s->dma_length = s->length;
69 }
70 return nents;
71}
72
73/* Unmap a set of streaming mode DMA translations.
74 * Again, cpu read rules concerning calls here are the same as for
75 * pci_unmap_single() above.
76 */
77static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
78 int nents, int dir)
79{
80}
81
82const struct dma_mapping_ops nommu_dma_ops = {
83 .map_single = nommu_map_single,
84 .unmap_single = nommu_unmap_single,
85 .map_sg = nommu_map_sg,
86 .unmap_sg = nommu_unmap_sg,
87 .is_phys = 1,
88};
89
90void __init no_iommu_init(void)
91{
92 if (dma_ops)
93 return;
94
95 force_iommu = 0; /* no HW IOMMU */
96 dma_ops = &nommu_dma_ops;
97}
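With no IOMMU the bus address handed to the device is simply the physical address, so check_addr() above has to reject any buffer that ends beyond the device's DMA mask. A stand-alone sketch of that comparison with made-up values:

/* Illustration of the overflow check in check_addr() (user-space sketch). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int fits_mask(uint64_t bus, size_t size, uint64_t mask)
{
	return bus + size <= mask;	/* same test as check_addr(), inverted */
}

int main(void)
{
	uint64_t mask = 0xffffffffULL;	/* a 32-bit-only device */

	printf("%d\n", fits_mask(0x00fff000ULL, 4096, mask));	/* 1: reachable  */
	printf("%d\n", fits_mask(0x100000000ULL, 4096, mask));	/* 0: above 4GB  */
	return 0;
}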
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
new file mode 100644
index 000000000000..b2f405ea7c85
--- /dev/null
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -0,0 +1,44 @@
1/* Glue code to lib/swiotlb.c */
2
3#include <linux/pci.h>
4#include <linux/cache.h>
5#include <linux/module.h>
6#include <linux/dma-mapping.h>
7
8#include <asm/iommu.h>
9#include <asm/swiotlb.h>
10#include <asm/dma.h>
11
12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14
15const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error,
17 .alloc_coherent = swiotlb_alloc_coherent,
18 .free_coherent = swiotlb_free_coherent,
19 .map_single = swiotlb_map_single,
20 .unmap_single = swiotlb_unmap_single,
21 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
22 .sync_single_for_device = swiotlb_sync_single_for_device,
23 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
24 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
25 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
26 .sync_sg_for_device = swiotlb_sync_sg_for_device,
27 .map_sg = swiotlb_map_sg,
28 .unmap_sg = swiotlb_unmap_sg,
29 .dma_supported = NULL,
30};
31
32void __init pci_swiotlb_init(void)
33{
34 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
35 if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
36 swiotlb = 1;
37 if (swiotlb_force)
38 swiotlb = 1;
39 if (swiotlb) {
40 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
41 swiotlb_init();
42 dma_ops = &swiotlb_dma_ops;
43 }
44}
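pci_swiotlb_init() above only turns on bounce buffering when some memory lies beyond what a 32-bit device can address, i.e. when the highest page frame sits above the 4GB mark (or when swiotlb is forced). A stand-alone sketch of that threshold; the MAX_DMA32_PFN constant is reconstructed here purely for illustration and the memory size is made up:

/* Illustration of the end_pfn > MAX_DMA32_PFN test (user-space sketch). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA32_PFN	(1ULL << (32 - PAGE_SHIFT))	/* first PFN above 4GB */

int main(void)
{
	uint64_t end_pfn = (6ULL << 30) >> PAGE_SHIFT;	/* e.g. 6GB of RAM */

	printf("swiotlb %s\n",
	       end_pfn > MAX_DMA32_PFN ? "enabled" : "not needed");
	return 0;
}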
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
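For a platform device with no resources, the alloc/add/put sequence above can also be expressed with the one-call helper; a hedged alternative sketch (the helper takes a name, an id, a resource array, and a resource count):

/* Hypothetical alternative using platform_device_register_simple(). */
#include <linux/err.h>
#include <linux/init.h>
#include <linux/platform_device.h>

static __init int add_pcspkr_simple(void)
{
	struct platform_device *pd;

	pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
	return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}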
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
new file mode 100644
index 000000000000..ae8f91214f15
--- /dev/null
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -0,0 +1,69 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <asm/io.h>
23#include <asm/proto.h>
24#include <asm/msr.h>
25#include <asm/vsyscall.h>
26
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
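cyc2us() above avoids a division by the 3.579545 MHz PM timer frequency by using the 286/1024 approximation mentioned in its comment. A stand-alone check of that shortcut against the exact conversion, using the roughly-one-tick delta from the comment:

/* Compare the 286/1024 shortcut with an exact conversion (user-space sketch). */
#include <stdint.h>
#include <stdio.h>

static uint32_t cyc2us(uint32_t cycles)
{
	cycles *= 286;		/* 1/3.579545 =~ 286/1024, error about 0.024% */
	return cycles >> 10;
}

int main(void)
{
	uint32_t delta = 35796;	/* about one tick at HZ=100 */

	printf("approx %u us, exact %.1f us\n", cyc2us(delta), delta / 3.579545);
	return 0;
}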
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 000000000000..84664710b784
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,951 @@
1/*
2 * linux/arch/i386/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * This file handles the architecture-dependent parts of process handling..
12 */
13
14#include <stdarg.h>
15
16#include <linux/cpu.h>
17#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/fs.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/elfcore.h>
23#include <linux/smp.h>
24#include <linux/stddef.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
27#include <linux/user.h>
28#include <linux/a.out.h>
29#include <linux/interrupt.h>
30#include <linux/utsname.h>
31#include <linux/delay.h>
32#include <linux/reboot.h>
33#include <linux/init.h>
34#include <linux/mc146818rtc.h>
35#include <linux/module.h>
36#include <linux/kallsyms.h>
37#include <linux/ptrace.h>
38#include <linux/random.h>
39#include <linux/personality.h>
40#include <linux/tick.h>
41#include <linux/percpu.h>
42
43#include <asm/uaccess.h>
44#include <asm/pgtable.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/ldt.h>
48#include <asm/processor.h>
49#include <asm/i387.h>
50#include <asm/desc.h>
51#include <asm/vm86.h>
52#ifdef CONFIG_MATH_EMULATION
53#include <asm/math_emu.h>
54#endif
55
56#include <linux/err.h>
57
58#include <asm/tlbflush.h>
59#include <asm/cpu.h>
60
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62
63static int hlt_counter;
64
65unsigned long boot_option_idle_override = 0;
66EXPORT_SYMBOL(boot_option_idle_override);
67
68DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
69EXPORT_PER_CPU_SYMBOL(current_task);
70
71DEFINE_PER_CPU(int, cpu_number);
72EXPORT_PER_CPU_SYMBOL(cpu_number);
73
74/*
75 * Return saved PC of a blocked thread.
76 */
77unsigned long thread_saved_pc(struct task_struct *tsk)
78{
79 return ((unsigned long *)tsk->thread.esp)[3];
80}
81
82/*
 83 * Power management idle function, if any..
84 */
85void (*pm_idle)(void);
86EXPORT_SYMBOL(pm_idle);
87static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
88
89void disable_hlt(void)
90{
91 hlt_counter++;
92}
93
94EXPORT_SYMBOL(disable_hlt);
95
96void enable_hlt(void)
97{
98 hlt_counter--;
99}
100
101EXPORT_SYMBOL(enable_hlt);
102
103/*
104 * We use this if we don't have any better
105 * idle routine..
106 */
107void default_idle(void)
108{
109 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
110 current_thread_info()->status &= ~TS_POLLING;
111 /*
112 * TS_POLLING-cleared state must be visible before we
113 * test NEED_RESCHED:
114 */
115 smp_mb();
116
117 local_irq_disable();
118 if (!need_resched())
119 safe_halt(); /* enables interrupts racelessly */
120 else
121 local_irq_enable();
122 current_thread_info()->status |= TS_POLLING;
123 } else {
124 /* loop is done by the caller */
125 cpu_relax();
126 }
127}
128#ifdef CONFIG_APM_MODULE
129EXPORT_SYMBOL(default_idle);
130#endif
131
132/*
133 * On SMP it's slightly faster (but much more power-consuming!)
134 * to poll the ->work.need_resched flag instead of waiting for the
135 * cross-CPU IPI to arrive. Use this option with caution.
136 */
137static void poll_idle (void)
138{
139 cpu_relax();
140}
141
142#ifdef CONFIG_HOTPLUG_CPU
143#include <asm/nmi.h>
 144/* We don't actually take the CPU down, just spin without interrupts. */
145static inline void play_dead(void)
146{
147 /* This must be done before dead CPU ack */
148 cpu_exit_clear();
149 wbinvd();
150 mb();
151 /* Ack it */
152 __get_cpu_var(cpu_state) = CPU_DEAD;
153
154 /*
155 * With physical CPU hotplug, we should halt the cpu
156 */
157 local_irq_disable();
158 while (1)
159 halt();
160}
161#else
162static inline void play_dead(void)
163{
164 BUG();
165}
166#endif /* CONFIG_HOTPLUG_CPU */
167
168/*
169 * The idle thread. There's no useful work to be
170 * done, so just try to conserve power and have a
171 * low exit latency (ie sit in a loop waiting for
172 * somebody to say that they'd like to reschedule)
173 */
174void cpu_idle(void)
175{
176 int cpu = smp_processor_id();
177
178 current_thread_info()->status |= TS_POLLING;
179
180 /* endless idle loop with no priority at all */
181 while (1) {
182 tick_nohz_stop_sched_tick();
183 while (!need_resched()) {
184 void (*idle)(void);
185
186 if (__get_cpu_var(cpu_idle_state))
187 __get_cpu_var(cpu_idle_state) = 0;
188
189 check_pgt_cache();
190 rmb();
191 idle = pm_idle;
192
193 if (!idle)
194 idle = default_idle;
195
196 if (cpu_is_offline(cpu))
197 play_dead();
198
199 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
200 idle();
201 }
202 tick_nohz_restart_sched_tick();
203 preempt_enable_no_resched();
204 schedule();
205 preempt_disable();
206 }
207}
208
209void cpu_idle_wait(void)
210{
211 unsigned int cpu, this_cpu = get_cpu();
212 cpumask_t map, tmp = current->cpus_allowed;
213
214 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
215 put_cpu();
216
217 cpus_clear(map);
218 for_each_online_cpu(cpu) {
219 per_cpu(cpu_idle_state, cpu) = 1;
220 cpu_set(cpu, map);
221 }
222
223 __get_cpu_var(cpu_idle_state) = 0;
224
225 wmb();
226 do {
227 ssleep(1);
228 for_each_online_cpu(cpu) {
229 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
230 cpu_clear(cpu, map);
231 }
232 cpus_and(map, map, cpu_online_map);
233 } while (!cpus_empty(map));
234
235 set_cpus_allowed(current, tmp);
236}
237EXPORT_SYMBOL_GPL(cpu_idle_wait);
238
239/*
240 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
241 * which can obviate IPI to trigger checking of need_resched.
242 * We execute MONITOR against need_resched and enter optimized wait state
243 * through MWAIT. Whenever someone changes need_resched, we would be woken
244 * up from MWAIT (without an IPI).
245 *
246 * New with Core Duo processors, MWAIT can take some hints based on CPU
247 * capability.
248 */
249void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
250{
251 if (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb();
254 if (!need_resched())
255 __mwait(eax, ecx);
256 }
257}
258
259/* Default MONITOR/MWAIT with no hints, used for default C1 state */
260static void mwait_idle(void)
261{
262 local_irq_enable();
263 mwait_idle_with_hints(0, 0);
264}
265
266void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
267{
268 if (cpu_has(c, X86_FEATURE_MWAIT)) {
269 printk("monitor/mwait feature present.\n");
270 /*
 271 * Skip if setup has overridden idle.
 272 * One CPU supports mwait => All CPUs support mwait
273 */
274 if (!pm_idle) {
275 printk("using mwait in idle threads.\n");
276 pm_idle = mwait_idle;
277 }
278 }
279}
280
281static int __init idle_setup(char *str)
282{
283 if (!strcmp(str, "poll")) {
284 printk("using polling idle threads.\n");
285 pm_idle = poll_idle;
286#ifdef CONFIG_X86_SMP
287 if (smp_num_siblings > 1)
288 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
289#endif
290 } else if (!strcmp(str, "mwait"))
291 force_mwait = 1;
292 else
293 return -1;
294
295 boot_option_idle_override = 1;
296 return 0;
297}
298early_param("idle", idle_setup);
299
300void show_regs(struct pt_regs * regs)
301{
302 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
303 unsigned long d0, d1, d2, d3, d6, d7;
304
305 printk("\n");
306 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
307 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
308 print_symbol("EIP is at %s\n", regs->eip);
309
310 if (user_mode_vm(regs))
311 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
312 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
313 regs->eflags, print_tainted(), init_utsname()->release,
314 (int)strcspn(init_utsname()->version, " "),
315 init_utsname()->version);
316 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
317 regs->eax,regs->ebx,regs->ecx,regs->edx);
318 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
319 regs->esi, regs->edi, regs->ebp);
320 printk(" DS: %04x ES: %04x FS: %04x\n",
321 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
322
323 cr0 = read_cr0();
324 cr2 = read_cr2();
325 cr3 = read_cr3();
326 cr4 = read_cr4_safe();
327 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
328
329 get_debugreg(d0, 0);
330 get_debugreg(d1, 1);
331 get_debugreg(d2, 2);
332 get_debugreg(d3, 3);
333 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
334 d0, d1, d2, d3);
335 get_debugreg(d6, 6);
336 get_debugreg(d7, 7);
337 printk("DR6: %08lx DR7: %08lx\n", d6, d7);
338
339 show_trace(NULL, regs, &regs->esp);
340}
341
342/*
343 * This gets run with %ebx containing the
344 * function to call, and %edx containing
345 * the "args".
346 */
347extern void kernel_thread_helper(void);
348
349/*
350 * Create a kernel thread
351 */
352int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
353{
354 struct pt_regs regs;
355
356 memset(&regs, 0, sizeof(regs));
357
358 regs.ebx = (unsigned long) fn;
359 regs.edx = (unsigned long) arg;
360
361 regs.xds = __USER_DS;
362 regs.xes = __USER_DS;
363 regs.xfs = __KERNEL_PERCPU;
364 regs.orig_eax = -1;
365 regs.eip = (unsigned long) kernel_thread_helper;
366 regs.xcs = __KERNEL_CS | get_kernel_rpl();
367 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
368
369 /* Ok, create the new process.. */
370 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
371}
372EXPORT_SYMBOL(kernel_thread);
373
374/*
375 * Free current thread data structures etc..
376 */
377void exit_thread(void)
378{
379 /* The process may have allocated an io port bitmap... nuke it. */
380 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
381 struct task_struct *tsk = current;
382 struct thread_struct *t = &tsk->thread;
383 int cpu = get_cpu();
384 struct tss_struct *tss = &per_cpu(init_tss, cpu);
385
386 kfree(t->io_bitmap_ptr);
387 t->io_bitmap_ptr = NULL;
388 clear_thread_flag(TIF_IO_BITMAP);
389 /*
390 * Careful, clear this in the TSS too:
391 */
392 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
393 t->io_bitmap_max = 0;
394 tss->io_bitmap_owner = NULL;
395 tss->io_bitmap_max = 0;
396 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
397 put_cpu();
398 }
399}
400
401void flush_thread(void)
402{
403 struct task_struct *tsk = current;
404
405 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
406 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
407 clear_tsk_thread_flag(tsk, TIF_DEBUG);
408 /*
409 * Forget coprocessor state..
410 */
411 clear_fpu(tsk);
412 clear_used_math();
413}
414
415void release_thread(struct task_struct *dead_task)
416{
417 BUG_ON(dead_task->mm);
418 release_vm86_irqs(dead_task);
419}
420
421/*
422 * This gets called before we allocate a new thread and copy
423 * the current task into it.
424 */
425void prepare_to_copy(struct task_struct *tsk)
426{
427 unlazy_fpu(tsk);
428}
429
430int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 unsigned long unused,
432 struct task_struct * p, struct pt_regs * regs)
433{
434 struct pt_regs * childregs;
435 struct task_struct *tsk;
436 int err;
437
438 childregs = task_pt_regs(p);
439 *childregs = *regs;
440 childregs->eax = 0;
441 childregs->esp = esp;
442
443 p->thread.esp = (unsigned long) childregs;
444 p->thread.esp0 = (unsigned long) (childregs+1);
445
446 p->thread.eip = (unsigned long) ret_from_fork;
447
448 savesegment(gs,p->thread.gs);
449
450 tsk = current;
451 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
452 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
453 IO_BITMAP_BYTES, GFP_KERNEL);
454 if (!p->thread.io_bitmap_ptr) {
455 p->thread.io_bitmap_max = 0;
456 return -ENOMEM;
457 }
458 set_tsk_thread_flag(p, TIF_IO_BITMAP);
459 }
460
461 /*
462 * Set a new TLS for the child thread?
463 */
464 if (clone_flags & CLONE_SETTLS) {
465 struct desc_struct *desc;
466 struct user_desc info;
467 int idx;
468
469 err = -EFAULT;
470 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
471 goto out;
472 err = -EINVAL;
473 if (LDT_empty(&info))
474 goto out;
475
476 idx = info.entry_number;
477 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
478 goto out;
479
480 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
481 desc->a = LDT_entry_a(&info);
482 desc->b = LDT_entry_b(&info);
483 }
484
485 err = 0;
486 out:
487 if (err && p->thread.io_bitmap_ptr) {
488 kfree(p->thread.io_bitmap_ptr);
489 p->thread.io_bitmap_max = 0;
490 }
491 return err;
492}
493
494/*
495 * fill in the user structure for a core dump..
496 */
497void dump_thread(struct pt_regs * regs, struct user * dump)
498{
499 int i;
500
501/* changed the size calculations - should hopefully work better. lbt */
502 dump->magic = CMAGIC;
503 dump->start_code = 0;
504 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
505 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
506 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
507 dump->u_dsize -= dump->u_tsize;
508 dump->u_ssize = 0;
509 for (i = 0; i < 8; i++)
510 dump->u_debugreg[i] = current->thread.debugreg[i];
511
512 if (dump->start_stack < TASK_SIZE)
513 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
514
515 dump->regs.ebx = regs->ebx;
516 dump->regs.ecx = regs->ecx;
517 dump->regs.edx = regs->edx;
518 dump->regs.esi = regs->esi;
519 dump->regs.edi = regs->edi;
520 dump->regs.ebp = regs->ebp;
521 dump->regs.eax = regs->eax;
522 dump->regs.ds = regs->xds;
523 dump->regs.es = regs->xes;
524 dump->regs.fs = regs->xfs;
525 savesegment(gs,dump->regs.gs);
526 dump->regs.orig_eax = regs->orig_eax;
527 dump->regs.eip = regs->eip;
528 dump->regs.cs = regs->xcs;
529 dump->regs.eflags = regs->eflags;
530 dump->regs.esp = regs->esp;
531 dump->regs.ss = regs->xss;
532
533 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
534}
535EXPORT_SYMBOL(dump_thread);
536
537/*
538 * Capture the user space registers if the task is not running (in user space)
539 */
540int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
541{
542 struct pt_regs ptregs = *task_pt_regs(tsk);
543 ptregs.xcs &= 0xffff;
544 ptregs.xds &= 0xffff;
545 ptregs.xes &= 0xffff;
546 ptregs.xss &= 0xffff;
547
548 elf_core_copy_regs(regs, &ptregs);
549
550 return 1;
551}
552
553#ifdef CONFIG_SECCOMP
554void hard_disable_TSC(void)
555{
556 write_cr4(read_cr4() | X86_CR4_TSD);
557}
558void disable_TSC(void)
559{
560 preempt_disable();
561 if (!test_and_set_thread_flag(TIF_NOTSC))
562 /*
563 * Must flip the CPU state synchronously with
564 * TIF_NOTSC in the current running context.
565 */
566 hard_disable_TSC();
567 preempt_enable();
568}
569void hard_enable_TSC(void)
570{
571 write_cr4(read_cr4() & ~X86_CR4_TSD);
572}
573#endif /* CONFIG_SECCOMP */
574
575static noinline void
576__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
577 struct tss_struct *tss)
578{
579 struct thread_struct *next;
580
581 next = &next_p->thread;
582
583 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
584 set_debugreg(next->debugreg[0], 0);
585 set_debugreg(next->debugreg[1], 1);
586 set_debugreg(next->debugreg[2], 2);
587 set_debugreg(next->debugreg[3], 3);
588 /* no 4 and 5 */
589 set_debugreg(next->debugreg[6], 6);
590 set_debugreg(next->debugreg[7], 7);
591 }
592
593#ifdef CONFIG_SECCOMP
594 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
595 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
596 /* prev and next are different */
597 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
598 hard_disable_TSC();
599 else
600 hard_enable_TSC();
601 }
602#endif
603
604 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
605 /*
606 * Disable the bitmap via an invalid offset. We still cache
607 * the previous bitmap owner and the IO bitmap contents:
608 */
609 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
610 return;
611 }
612
613 if (likely(next == tss->io_bitmap_owner)) {
614 /*
615 * Previous owner of the bitmap (hence the bitmap content)
616	 * matches the next task, so we don't have to do anything but
617 * to set a valid offset in the TSS:
618 */
619 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
620 return;
621 }
622 /*
623	 * Lazy TSS I/O bitmap copy. We set an invalid offset here
624	 * and let the task get a GPF if it performs an I/O
625	 * instruction. The GPF handler will verify that the
626	 * faulting task has a valid I/O bitmap and, if true, does the
627	 * real copy and restarts the instruction. This will save us
628 * redundant copies when the currently switched task does not
629 * perform any I/O during its timeslice.
630 */
631 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
632}
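/*
 * Illustration (userspace sketch, not part of this file): the lazy bitmap
 * path above is what a port-banging program exercises.  Assumes the usual
 * glibc wrappers from <sys/io.h>; 0x378 is just a sample parallel-port
 * base and the call needs CAP_SYS_RAWIO.
 */
#include <sys/io.h>		/* ioperm(), outb() */

static int poke_parport(void)
{
	/* ioperm() makes the kernel allocate this task's io_bitmap and
	 * set TIF_IO_BITMAP, so __switch_to_xtra() starts caring. */
	if (ioperm(0x378, 4, 1) != 0)
		return -1;
	/* After a context switch the TSS offset is "lazy"; the first OUT
	 * takes a #GP, the handler copies the bitmap and restarts it. */
	outb(0x00, 0x378);
	return 0;
}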
633
634/*
635 * switch_to(x,y) should switch tasks from x to y.
636 *
637 * We fsave/fwait so that an exception goes off at the right time
638 * (as a call from the fsave or fwait in effect) rather than to
639 * the wrong process. Lazy FP saving no longer makes any sense
640 * with modern CPUs, and this simplifies a lot of things (SMP
641 * and UP become the same).
642 *
643 * NOTE! We used to use the x86 hardware context switching. The
644 * reason for not using it any more becomes apparent when you
645 * try to recover gracefully from saved state that is no longer
646 * valid (stale segment register values in particular). With the
647 * hardware task-switch, there is no way to fix up bad state in
648 * a reasonable manner.
649 *
650 * The fact that Intel documents the hardware task-switching to
651 * be slow is largely a red herring - this code is not noticeably
652 * faster. However, there _is_ some room for improvement here,
653 * so the performance issues may eventually be a valid point.
654 * More important, however, is the fact that this allows us much
655 * more flexibility.
656 *
657 * The return value (in %eax) will be the "prev" task after
658 * the task-switch, and shows up in ret_from_fork in entry.S,
659 * for example.
660 */
661struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
662{
663 struct thread_struct *prev = &prev_p->thread,
664 *next = &next_p->thread;
665 int cpu = smp_processor_id();
666 struct tss_struct *tss = &per_cpu(init_tss, cpu);
667
668 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
669
670 __unlazy_fpu(prev_p);
671
672
673 /* we're going to use this soon, after a few expensive things */
674 if (next_p->fpu_counter > 5)
675 prefetch(&next->i387.fxsave);
676
677 /*
678 * Reload esp0.
679 */
680 load_esp0(tss, next);
681
682 /*
683 * Save away %gs. No need to save %fs, as it was saved on the
684 * stack on entry. No need to save %es and %ds, as those are
685 * always kernel segments while inside the kernel. Doing this
686 * before setting the new TLS descriptors avoids the situation
687 * where we temporarily have non-reloadable segments in %fs
688 * and %gs. This could be an issue if the NMI handler ever
689 * used %fs or %gs (it does not today), or if the kernel is
690 * running inside of a hypervisor layer.
691 */
692 savesegment(gs, prev->gs);
693
694 /*
695 * Load the per-thread Thread-Local Storage descriptor.
696 */
697 load_TLS(next, cpu);
698
699 /*
700 * Restore IOPL if needed. In normal use, the flags restore
701 * in the switch assembly will handle this. But if the kernel
702 * is running virtualized at a non-zero CPL, the popf will
703 * not restore flags, so it must be done in a separate step.
704 */
705 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
706 set_iopl_mask(next->iopl);
707
708 /*
709 * Now maybe handle debug registers and/or IO bitmaps
710 */
711 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
712 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
713 __switch_to_xtra(prev_p, next_p, tss);
714
715 /*
716 * Leave lazy mode, flushing any hypercalls made here.
717 * This must be done before restoring TLS segments so
718 * the GDT and LDT are properly updated, and must be
719 * done before math_state_restore, so the TS bit is up
720 * to date.
721 */
722 arch_leave_lazy_cpu_mode();
723
724 /* If the task has used fpu the last 5 timeslices, just do a full
725 * restore of the math state immediately to avoid the trap; the
726 * chances of needing FPU soon are obviously high now
727 */
728 if (next_p->fpu_counter > 5)
729 math_state_restore();
730
731 /*
732 * Restore %gs if needed (which is common)
733 */
734 if (prev->gs | next->gs)
735 loadsegment(gs, next->gs);
736
737 x86_write_percpu(current_task, next_p);
738
739 return prev_p;
740}
741
742asmlinkage int sys_fork(struct pt_regs regs)
743{
744 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
745}
746
747asmlinkage int sys_clone(struct pt_regs regs)
748{
749 unsigned long clone_flags;
750 unsigned long newsp;
751 int __user *parent_tidptr, *child_tidptr;
752
753 clone_flags = regs.ebx;
754 newsp = regs.ecx;
755 parent_tidptr = (int __user *)regs.edx;
756 child_tidptr = (int __user *)regs.edi;
757 if (!newsp)
758 newsp = regs.esp;
759 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
760}
761
762/*
763 * This is trivial, and on the face of it looks like it
764 * could equally well be done in user mode.
765 *
766 * Not so, for quite unobvious reasons - register pressure.
767 * In user mode vfork() cannot have a stack frame, and if
768 * done by calling the "clone()" system call directly, you
769 * do not have enough call-clobbered registers to hold all
770 * the information you need.
771 */
772asmlinkage int sys_vfork(struct pt_regs regs)
773{
774 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
775}
776
777/*
778 * sys_execve() executes a new program.
779 */
780asmlinkage int sys_execve(struct pt_regs regs)
781{
782 int error;
783 char * filename;
784
785 filename = getname((char __user *) regs.ebx);
786 error = PTR_ERR(filename);
787 if (IS_ERR(filename))
788 goto out;
789 error = do_execve(filename,
790 (char __user * __user *) regs.ecx,
791 (char __user * __user *) regs.edx,
792 &regs);
793 if (error == 0) {
794 task_lock(current);
795 current->ptrace &= ~PT_DTRACE;
796 task_unlock(current);
797 /* Make sure we don't return using sysenter.. */
798 set_thread_flag(TIF_IRET);
799 }
800 putname(filename);
801out:
802 return error;
803}
804
805#define top_esp (THREAD_SIZE - sizeof(unsigned long))
806#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
807
808unsigned long get_wchan(struct task_struct *p)
809{
810 unsigned long ebp, esp, eip;
811 unsigned long stack_page;
812 int count = 0;
813 if (!p || p == current || p->state == TASK_RUNNING)
814 return 0;
815 stack_page = (unsigned long)task_stack_page(p);
816 esp = p->thread.esp;
817 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
818 return 0;
819 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
820 ebp = *(unsigned long *) esp;
821 do {
822 if (ebp < stack_page || ebp > top_ebp+stack_page)
823 return 0;
824 eip = *(unsigned long *) (ebp+4);
825 if (!in_sched_functions(eip))
826 return eip;
827 ebp = *(unsigned long *) ebp;
828 } while (count++ < 16);
829 return 0;
830}
831
832/*
833 * get_free_idx: get a yet unused TLS descriptor index.
834 */
835static int get_free_idx(void)
836{
837 struct thread_struct *t = &current->thread;
838 int idx;
839
840 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
841 if (desc_empty(t->tls_array + idx))
842 return idx + GDT_ENTRY_TLS_MIN;
843 return -ESRCH;
844}
845
846/*
847 * Set a given TLS descriptor:
848 */
849asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
850{
851 struct thread_struct *t = &current->thread;
852 struct user_desc info;
853 struct desc_struct *desc;
854 int cpu, idx;
855
856 if (copy_from_user(&info, u_info, sizeof(info)))
857 return -EFAULT;
858 idx = info.entry_number;
859
860 /*
861 * index -1 means the kernel should try to find and
862 * allocate an empty descriptor:
863 */
864 if (idx == -1) {
865 idx = get_free_idx();
866 if (idx < 0)
867 return idx;
868 if (put_user(idx, &u_info->entry_number))
869 return -EFAULT;
870 }
871
872 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
873 return -EINVAL;
874
875 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
876
877 /*
878 * We must not get preempted while modifying the TLS.
879 */
880 cpu = get_cpu();
881
882 if (LDT_empty(&info)) {
883 desc->a = 0;
884 desc->b = 0;
885 } else {
886 desc->a = LDT_entry_a(&info);
887 desc->b = LDT_entry_b(&info);
888 }
889 load_TLS(t, cpu);
890
891 put_cpu();
892
893 return 0;
894}
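/*
 * Illustration (userspace sketch, not part of this file): using the
 * "entry_number == -1" convention handled above.  Raw syscall shown for
 * clarity; glibc's threading code does essentially the same thing when it
 * sets up the thread pointer.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>

static int install_tls(void *base)
{
	struct user_desc ud = {
		.entry_number	= -1,	/* ask the kernel for a free TLS slot */
		.base_addr	= (unsigned long)base,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};

	if (syscall(SYS_set_thread_area, &ud) != 0)
		return -1;
	return ud.entry_number;	/* the kernel wrote back the chosen index */
}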
895
896/*
897 * Get the current Thread-Local Storage area:
898 */
899
900#define GET_BASE(desc) ( \
901 (((desc)->a >> 16) & 0x0000ffff) | \
902 (((desc)->b << 16) & 0x00ff0000) | \
903 ( (desc)->b & 0xff000000) )
904
905#define GET_LIMIT(desc) ( \
906 ((desc)->a & 0x0ffff) | \
907 ((desc)->b & 0xf0000) )
908
909#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
910#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
911#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
912#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
913#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
914#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
915
916asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
917{
918 struct user_desc info;
919 struct desc_struct *desc;
920 int idx;
921
922 if (get_user(idx, &u_info->entry_number))
923 return -EFAULT;
924 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
925 return -EINVAL;
926
927 memset(&info, 0, sizeof(info));
928
929 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
930
931 info.entry_number = idx;
932 info.base_addr = GET_BASE(desc);
933 info.limit = GET_LIMIT(desc);
934 info.seg_32bit = GET_32BIT(desc);
935 info.contents = GET_CONTENTS(desc);
936 info.read_exec_only = !GET_WRITABLE(desc);
937 info.limit_in_pages = GET_LIMIT_PAGES(desc);
938 info.seg_not_present = !GET_PRESENT(desc);
939 info.useable = GET_USEABLE(desc);
940
941 if (copy_to_user(u_info, &info, sizeof(info)))
942 return -EFAULT;
943 return 0;
944}
945
946unsigned long arch_align_stack(unsigned long sp)
947{
948 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
949 sp -= get_random_int() % 8192;
950 return sp & ~0xf;
951}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644
index 000000000000..98956555450b
--- /dev/null
+++ b/arch/x86/kernel/process_64.c
@@ -0,0 +1,903 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * CPU hotplug support - ashok.raj@intel.com
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/cpu.h>
22#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/fs.h>
27#include <linux/elfcore.h>
28#include <linux/smp.h>
29#include <linux/slab.h>
30#include <linux/user.h>
31#include <linux/module.h>
32#include <linux/a.out.h>
33#include <linux/interrupt.h>
34#include <linux/delay.h>
35#include <linux/ptrace.h>
36#include <linux/utsname.h>
37#include <linux/random.h>
38#include <linux/notifier.h>
39#include <linux/kprobes.h>
40#include <linux/kdebug.h>
41
42#include <asm/uaccess.h>
43#include <asm/pgtable.h>
44#include <asm/system.h>
45#include <asm/io.h>
46#include <asm/processor.h>
47#include <asm/i387.h>
48#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h>
51#include <asm/desc.h>
52#include <asm/proto.h>
53#include <asm/ia32.h>
54#include <asm/idle.h>
55
56asmlinkage extern void ret_from_fork(void);
57
58unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59
60unsigned long boot_option_idle_override = 0;
61EXPORT_SYMBOL(boot_option_idle_override);
62
63/*
64 * Power management idle function, if any..
65 */
66void (*pm_idle)(void);
67EXPORT_SYMBOL(pm_idle);
68static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
69
70static ATOMIC_NOTIFIER_HEAD(idle_notifier);
71
72void idle_notifier_register(struct notifier_block *n)
73{
74 atomic_notifier_chain_register(&idle_notifier, n);
75}
76EXPORT_SYMBOL_GPL(idle_notifier_register);
77
78void idle_notifier_unregister(struct notifier_block *n)
79{
80 atomic_notifier_chain_unregister(&idle_notifier, n);
81}
82EXPORT_SYMBOL(idle_notifier_unregister);
83
84void enter_idle(void)
85{
86 write_pda(isidle, 1);
87 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
88}
89
90static void __exit_idle(void)
91{
92 if (test_and_clear_bit_pda(0, isidle) == 0)
93 return;
94 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
95}
96
97/* Called from interrupts to signify idle end */
98void exit_idle(void)
99{
100 /* idle loop has pid 0 */
101 if (current->pid)
102 return;
103 __exit_idle();
104}
105
106/*
107 * We use this if we don't have any better
108 * idle routine..
109 */
110static void default_idle(void)
111{
112 current_thread_info()->status &= ~TS_POLLING;
113 /*
114 * TS_POLLING-cleared state must be visible before we
115 * test NEED_RESCHED:
116 */
117 smp_mb();
118 local_irq_disable();
119 if (!need_resched()) {
120 /* Enables interrupts one instruction before HLT.
121 x86 special cases this so there is no race. */
122 safe_halt();
123 } else
124 local_irq_enable();
125 current_thread_info()->status |= TS_POLLING;
126}
127
128/*
129 * On SMP it's slightly faster (but much more power-consuming!)
130 * to poll the ->need_resched flag instead of waiting for the
131 * cross-CPU IPI to arrive. Use this option with caution.
132 */
133static void poll_idle (void)
134{
135 local_irq_enable();
136 cpu_relax();
137}
138
139void cpu_idle_wait(void)
140{
141 unsigned int cpu, this_cpu = get_cpu();
142 cpumask_t map, tmp = current->cpus_allowed;
143
144 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
145 put_cpu();
146
147 cpus_clear(map);
148 for_each_online_cpu(cpu) {
149 per_cpu(cpu_idle_state, cpu) = 1;
150 cpu_set(cpu, map);
151 }
152
153 __get_cpu_var(cpu_idle_state) = 0;
154
155 wmb();
156 do {
157 ssleep(1);
158 for_each_online_cpu(cpu) {
159 if (cpu_isset(cpu, map) &&
160 !per_cpu(cpu_idle_state, cpu))
161 cpu_clear(cpu, map);
162 }
163 cpus_and(map, map, cpu_online_map);
164 } while (!cpus_empty(map));
165
166 set_cpus_allowed(current, tmp);
167}
168EXPORT_SYMBOL_GPL(cpu_idle_wait);
169
170#ifdef CONFIG_HOTPLUG_CPU
171DECLARE_PER_CPU(int, cpu_state);
172
173#include <asm/nmi.h>
174/* We halt the CPU with physical CPU hotplug */
175static inline void play_dead(void)
176{
177 idle_task_exit();
178 wbinvd();
179 mb();
180 /* Ack it */
181 __get_cpu_var(cpu_state) = CPU_DEAD;
182
183 local_irq_disable();
184 while (1)
185 halt();
186}
187#else
188static inline void play_dead(void)
189{
190 BUG();
191}
192#endif /* CONFIG_HOTPLUG_CPU */
193
194/*
195 * The idle thread. There's no useful work to be
196 * done, so just try to conserve power and have a
197 * low exit latency (ie sit in a loop waiting for
198 * somebody to say that they'd like to reschedule)
199 */
200void cpu_idle (void)
201{
202 current_thread_info()->status |= TS_POLLING;
203 /* endless idle loop with no priority at all */
204 while (1) {
205 while (!need_resched()) {
206 void (*idle)(void);
207
208 if (__get_cpu_var(cpu_idle_state))
209 __get_cpu_var(cpu_idle_state) = 0;
210
211 rmb();
212 idle = pm_idle;
213 if (!idle)
214 idle = default_idle;
215 if (cpu_is_offline(smp_processor_id()))
216 play_dead();
217 /*
218 * Idle routines should keep interrupts disabled
219 * from here on, until they go to idle.
220 * Otherwise, idle callbacks can misfire.
221 */
222 local_irq_disable();
223 enter_idle();
224 idle();
225 /* In many cases the interrupt that ended idle
226 has already called exit_idle. But some idle
227 loops can be woken up without interrupt. */
228 __exit_idle();
229 }
230
231 preempt_enable_no_resched();
232 schedule();
233 preempt_disable();
234 }
235}
236
237/*
238 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
239 * which can obviate the IPI used to trigger checking of need_resched.
240 * We execute MONITOR against need_resched and enter optimized wait state
241 * through MWAIT. Whenever someone changes need_resched, we would be woken
242 * up from MWAIT (without an IPI).
243 *
244 * New with Core Duo processors, MWAIT can take some hints based on CPU
245 * capability.
246 */
247void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
248{
249 if (!need_resched()) {
250 __monitor((void *)&current_thread_info()->flags, 0, 0);
251 smp_mb();
252 if (!need_resched())
253 __mwait(eax, ecx);
254 }
255}
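/*
 * Note added for illustration (not in the original source): the hint
 * encoding follows the SDM convention -- roughly, eax[7:4] selects the
 * target C-state (0 = C1, 1 = C2, ...), eax[3:0] a sub-state, and ecx
 * bit 0 asks that masked interrupts still break the wait.  So a caller
 * wanting something like C2 with interrupt break events would pass
 * mwait_idle_with_hints(0x10, 0x01).
 */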
256
257/* Default MONITOR/MWAIT with no hints, used for default C1 state */
258static void mwait_idle(void)
259{
260 if (!need_resched()) {
261 __monitor((void *)&current_thread_info()->flags, 0, 0);
262 smp_mb();
263 if (!need_resched())
264 __sti_mwait(0, 0);
265 else
266 local_irq_enable();
267 } else {
268 local_irq_enable();
269 }
270}
271
272void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
273{
274 static int printed;
275 if (cpu_has(c, X86_FEATURE_MWAIT)) {
276 /*
277 * Skip, if setup has overridden idle.
278 * One CPU supports mwait => all CPUs support mwait
279 */
280 if (!pm_idle) {
281 if (!printed) {
282 printk(KERN_INFO "using mwait in idle threads.\n");
283 printed = 1;
284 }
285 pm_idle = mwait_idle;
286 }
287 }
288}
289
290static int __init idle_setup (char *str)
291{
292 if (!strcmp(str, "poll")) {
293 printk("using polling idle threads.\n");
294 pm_idle = poll_idle;
295 } else if (!strcmp(str, "mwait"))
296 force_mwait = 1;
297 else
298 return -1;
299
300 boot_option_idle_override = 1;
301 return 0;
302}
303early_param("idle", idle_setup);
304
305/* Prints also some state that isn't saved in the pt_regs */
306void __show_regs(struct pt_regs * regs)
307{
308 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
309 unsigned long d0, d1, d2, d3, d6, d7;
310 unsigned int fsindex,gsindex;
311 unsigned int ds,cs,es;
312
313 printk("\n");
314 print_modules();
315 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
316 current->pid, current->comm, print_tainted(),
317 init_utsname()->release,
318 (int)strcspn(init_utsname()->version, " "),
319 init_utsname()->version);
320 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
321 printk_address(regs->rip);
322 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
323 regs->eflags);
324 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
325 regs->rax, regs->rbx, regs->rcx);
326 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
327 regs->rdx, regs->rsi, regs->rdi);
328 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
329 regs->rbp, regs->r8, regs->r9);
330 printk("R10: %016lx R11: %016lx R12: %016lx\n",
331 regs->r10, regs->r11, regs->r12);
332 printk("R13: %016lx R14: %016lx R15: %016lx\n",
333 regs->r13, regs->r14, regs->r15);
334
335 asm("movl %%ds,%0" : "=r" (ds));
336 asm("movl %%cs,%0" : "=r" (cs));
337 asm("movl %%es,%0" : "=r" (es));
338 asm("movl %%fs,%0" : "=r" (fsindex));
339 asm("movl %%gs,%0" : "=r" (gsindex));
340
341 rdmsrl(MSR_FS_BASE, fs);
342 rdmsrl(MSR_GS_BASE, gs);
343 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344
345 cr0 = read_cr0();
346 cr2 = read_cr2();
347 cr3 = read_cr3();
348 cr4 = read_cr4();
349
350 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
351 fs,fsindex,gs,gsindex,shadowgs);
352 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
353 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
354
355 get_debugreg(d0, 0);
356 get_debugreg(d1, 1);
357 get_debugreg(d2, 2);
358 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
359 get_debugreg(d3, 3);
360 get_debugreg(d6, 6);
361 get_debugreg(d7, 7);
362 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
363}
364
365void show_regs(struct pt_regs *regs)
366{
367 printk("CPU %d:", smp_processor_id());
368 __show_regs(regs);
369 show_trace(NULL, regs, (void *)(regs + 1));
370}
371
372/*
373 * Free current thread data structures etc..
374 */
375void exit_thread(void)
376{
377 struct task_struct *me = current;
378 struct thread_struct *t = &me->thread;
379
380 if (me->thread.io_bitmap_ptr) {
381 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
382
383 kfree(t->io_bitmap_ptr);
384 t->io_bitmap_ptr = NULL;
385 clear_thread_flag(TIF_IO_BITMAP);
386 /*
387 * Careful, clear this in the TSS too:
388 */
389 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
390 t->io_bitmap_max = 0;
391 put_cpu();
392 }
393}
394
395void flush_thread(void)
396{
397 struct task_struct *tsk = current;
398
399 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
400 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
401 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
402 clear_tsk_thread_flag(tsk, TIF_IA32);
403 } else {
404 set_tsk_thread_flag(tsk, TIF_IA32);
405 current_thread_info()->status |= TS_COMPAT;
406 }
407 }
408 clear_tsk_thread_flag(tsk, TIF_DEBUG);
409
410 tsk->thread.debugreg0 = 0;
411 tsk->thread.debugreg1 = 0;
412 tsk->thread.debugreg2 = 0;
413 tsk->thread.debugreg3 = 0;
414 tsk->thread.debugreg6 = 0;
415 tsk->thread.debugreg7 = 0;
416 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
417 /*
418 * Forget coprocessor state..
419 */
420 clear_fpu(tsk);
421 clear_used_math();
422}
423
424void release_thread(struct task_struct *dead_task)
425{
426 if (dead_task->mm) {
427 if (dead_task->mm->context.size) {
428 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
429 dead_task->comm,
430 dead_task->mm->context.ldt,
431 dead_task->mm->context.size);
432 BUG();
433 }
434 }
435}
436
437static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
438{
439 struct user_desc ud = {
440 .base_addr = addr,
441 .limit = 0xfffff,
442 .seg_32bit = 1,
443 .limit_in_pages = 1,
444 .useable = 1,
445 };
446 struct n_desc_struct *desc = (void *)t->thread.tls_array;
447 desc += tls;
448 desc->a = LDT_entry_a(&ud);
449 desc->b = LDT_entry_b(&ud);
450}
451
452static inline u32 read_32bit_tls(struct task_struct *t, int tls)
453{
454 struct desc_struct *desc = (void *)t->thread.tls_array;
455 desc += tls;
456 return desc->base0 |
457 (((u32)desc->base1) << 16) |
458 (((u32)desc->base2) << 24);
459}
460
461/*
462 * This gets called before we allocate a new thread and copy
463 * the current task into it.
464 */
465void prepare_to_copy(struct task_struct *tsk)
466{
467 unlazy_fpu(tsk);
468}
469
470int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
471 unsigned long unused,
472 struct task_struct * p, struct pt_regs * regs)
473{
474 int err;
475 struct pt_regs * childregs;
476 struct task_struct *me = current;
477
478 childregs = ((struct pt_regs *)
479 (THREAD_SIZE + task_stack_page(p))) - 1;
480 *childregs = *regs;
481
482 childregs->rax = 0;
483 childregs->rsp = rsp;
484 if (rsp == ~0UL)
485 childregs->rsp = (unsigned long)childregs;
486
487 p->thread.rsp = (unsigned long) childregs;
488 p->thread.rsp0 = (unsigned long) (childregs+1);
489 p->thread.userrsp = me->thread.userrsp;
490
491 set_tsk_thread_flag(p, TIF_FORK);
492
493 p->thread.fs = me->thread.fs;
494 p->thread.gs = me->thread.gs;
495
496 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
497 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
498 asm("mov %%es,%0" : "=m" (p->thread.es));
499 asm("mov %%ds,%0" : "=m" (p->thread.ds));
500
501 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
502 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
503 if (!p->thread.io_bitmap_ptr) {
504 p->thread.io_bitmap_max = 0;
505 return -ENOMEM;
506 }
507 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
508 IO_BITMAP_BYTES);
509 set_tsk_thread_flag(p, TIF_IO_BITMAP);
510 }
511
512 /*
513 * Set a new TLS for the child thread?
514 */
515 if (clone_flags & CLONE_SETTLS) {
516#ifdef CONFIG_IA32_EMULATION
517 if (test_thread_flag(TIF_IA32))
518 err = ia32_child_tls(p, childregs);
519 else
520#endif
521 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
522 if (err)
523 goto out;
524 }
525 err = 0;
526out:
527 if (err && p->thread.io_bitmap_ptr) {
528 kfree(p->thread.io_bitmap_ptr);
529 p->thread.io_bitmap_max = 0;
530 }
531 return err;
532}
533
534/*
535 * This special macro can be used to load a debugging register
536 */
537#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
538
539static inline void __switch_to_xtra(struct task_struct *prev_p,
540 struct task_struct *next_p,
541 struct tss_struct *tss)
542{
543 struct thread_struct *prev, *next;
544
545	prev = &prev_p->thread;
546 next = &next_p->thread;
547
548 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
549 loaddebug(next, 0);
550 loaddebug(next, 1);
551 loaddebug(next, 2);
552 loaddebug(next, 3);
553 /* no 4 and 5 */
554 loaddebug(next, 6);
555 loaddebug(next, 7);
556 }
557
558 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
559 /*
560 * Copy the relevant range of the IO bitmap.
561 * Normally this is 128 bytes or less:
562 */
563 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
564 max(prev->io_bitmap_max, next->io_bitmap_max));
565 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
566 /*
567 * Clear any possible leftover bits:
568 */
569 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
570 }
571}
572
573/*
574 * switch_to(x,y) should switch tasks from x to y.
575 *
576 * This could still be optimized:
577 * - fold all the options into a flag word and test it with a single test.
578 * - could test fs/gs bitsliced
579 *
580 * Kprobes not supported here. Set the probe on schedule instead.
581 */
582__kprobes struct task_struct *
583__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584{
585 struct thread_struct *prev = &prev_p->thread,
586 *next = &next_p->thread;
587 int cpu = smp_processor_id();
588 struct tss_struct *tss = &per_cpu(init_tss, cpu);
589
590 /* we're going to use this soon, after a few expensive things */
591 if (next_p->fpu_counter>5)
592 prefetch(&next->i387.fxsave);
593
594 /*
595 * Reload esp0, LDT and the page table pointer:
596 */
597 tss->rsp0 = next->rsp0;
598
599 /*
600 * Switch DS and ES.
601 * This won't pick up thread selector changes, but I guess that is ok.
602 */
603 asm volatile("mov %%es,%0" : "=m" (prev->es));
604 if (unlikely(next->es | prev->es))
605 loadsegment(es, next->es);
606
607 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
608 if (unlikely(next->ds | prev->ds))
609 loadsegment(ds, next->ds);
610
611 load_TLS(next, cpu);
612
613 /*
614 * Switch FS and GS.
615 */
616 {
617 unsigned fsindex;
618 asm volatile("movl %%fs,%0" : "=r" (fsindex));
619 /* segment register != 0 always requires a reload.
620 also reload when it has changed.
621 when prev process used 64bit base always reload
622 to avoid an information leak. */
623 if (unlikely(fsindex | next->fsindex | prev->fs)) {
624 loadsegment(fs, next->fsindex);
625 /* check if the user used a selector != 0
626 * if yes clear 64bit base, since overloaded base
627 * is always mapped to the Null selector
628 */
629 if (fsindex)
630 prev->fs = 0;
631 }
632 /* when next process has a 64bit base use it */
633 if (next->fs)
634 wrmsrl(MSR_FS_BASE, next->fs);
635 prev->fsindex = fsindex;
636 }
637 {
638 unsigned gsindex;
639 asm volatile("movl %%gs,%0" : "=r" (gsindex));
640 if (unlikely(gsindex | next->gsindex | prev->gs)) {
641 load_gs_index(next->gsindex);
642 if (gsindex)
643 prev->gs = 0;
644 }
645 if (next->gs)
646 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
647 prev->gsindex = gsindex;
648 }
649
650 /* Must be after DS reload */
651 unlazy_fpu(prev_p);
652
653 /*
654 * Switch the PDA and FPU contexts.
655 */
656 prev->userrsp = read_pda(oldrsp);
657 write_pda(oldrsp, next->userrsp);
658 write_pda(pcurrent, next_p);
659
660 write_pda(kernelstack,
661 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
662#ifdef CONFIG_CC_STACKPROTECTOR
663 write_pda(stack_canary, next_p->stack_canary);
664 /*
665 * Build time only check to make sure the stack_canary is at
666 * offset 40 in the pda; this is a gcc ABI requirement
667 */
668 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
669#endif
670
671 /*
672 * Now maybe reload the debug registers and handle I/O bitmaps
673 */
674 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
675 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
676 __switch_to_xtra(prev_p, next_p, tss);
677
678 /* If the task has used fpu the last 5 timeslices, just do a full
679 * restore of the math state immediately to avoid the trap; the
680 * chances of needing FPU soon are obviously high now
681 */
682 if (next_p->fpu_counter>5)
683 math_state_restore();
684 return prev_p;
685}
686
687/*
688 * sys_execve() executes a new program.
689 */
690asmlinkage
691long sys_execve(char __user *name, char __user * __user *argv,
692 char __user * __user *envp, struct pt_regs regs)
693{
694 long error;
695 char * filename;
696
697 filename = getname(name);
698 error = PTR_ERR(filename);
699 if (IS_ERR(filename))
700 return error;
701 error = do_execve(filename, argv, envp, &regs);
702 if (error == 0) {
703 task_lock(current);
704 current->ptrace &= ~PT_DTRACE;
705 task_unlock(current);
706 }
707 putname(filename);
708 return error;
709}
710
711void set_personality_64bit(void)
712{
713 /* inherit personality from parent */
714
715 /* Make sure to be in 64bit mode */
716 clear_thread_flag(TIF_IA32);
717
718 /* TBD: overwrites user setup. Should have two bits.
719 But 64bit processes have always behaved this way,
720 so it's not too bad. The main problem is just that
721	   32-bit children are affected again. */
722 current->personality &= ~READ_IMPLIES_EXEC;
723}
724
725asmlinkage long sys_fork(struct pt_regs *regs)
726{
727 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
728}
729
730asmlinkage long
731sys_clone(unsigned long clone_flags, unsigned long newsp,
732 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
733{
734 if (!newsp)
735 newsp = regs->rsp;
736 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
737}
738
739/*
740 * This is trivial, and on the face of it looks like it
741 * could equally well be done in user mode.
742 *
743 * Not so, for quite unobvious reasons - register pressure.
744 * In user mode vfork() cannot have a stack frame, and if
745 * done by calling the "clone()" system call directly, you
746 * do not have enough call-clobbered registers to hold all
747 * the information you need.
748 */
749asmlinkage long sys_vfork(struct pt_regs *regs)
750{
751 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
752 NULL, NULL);
753}
754
755unsigned long get_wchan(struct task_struct *p)
756{
757 unsigned long stack;
758 u64 fp,rip;
759 int count = 0;
760
761 if (!p || p == current || p->state==TASK_RUNNING)
762 return 0;
763 stack = (unsigned long)task_stack_page(p);
764 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
765 return 0;
766 fp = *(u64 *)(p->thread.rsp);
767 do {
768 if (fp < (unsigned long)stack ||
769 fp > (unsigned long)stack+THREAD_SIZE)
770 return 0;
771 rip = *(u64 *)(fp+8);
772 if (!in_sched_functions(rip))
773 return rip;
774 fp = *(u64 *)fp;
775 } while (count++ < 16);
776 return 0;
777}
778
779long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
780{
781 int ret = 0;
782 int doit = task == current;
783 int cpu;
784
785 switch (code) {
786 case ARCH_SET_GS:
787 if (addr >= TASK_SIZE_OF(task))
788 return -EPERM;
789 cpu = get_cpu();
790 /* handle small bases via the GDT because that's faster to
791 switch. */
792 if (addr <= 0xffffffff) {
793 set_32bit_tls(task, GS_TLS, addr);
794 if (doit) {
795 load_TLS(&task->thread, cpu);
796 load_gs_index(GS_TLS_SEL);
797 }
798 task->thread.gsindex = GS_TLS_SEL;
799 task->thread.gs = 0;
800 } else {
801 task->thread.gsindex = 0;
802 task->thread.gs = addr;
803 if (doit) {
804 load_gs_index(0);
805 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
806 }
807 }
808 put_cpu();
809 break;
810 case ARCH_SET_FS:
811 /* Not strictly needed for fs, but do it for symmetry
812 with gs */
813 if (addr >= TASK_SIZE_OF(task))
814 return -EPERM;
815 cpu = get_cpu();
816 /* handle small bases via the GDT because that's faster to
817 switch. */
818 if (addr <= 0xffffffff) {
819 set_32bit_tls(task, FS_TLS, addr);
820 if (doit) {
821 load_TLS(&task->thread, cpu);
822 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
823 }
824 task->thread.fsindex = FS_TLS_SEL;
825 task->thread.fs = 0;
826 } else {
827 task->thread.fsindex = 0;
828 task->thread.fs = addr;
829 if (doit) {
830 /* set the selector to 0 to not confuse
831 __switch_to */
832 asm volatile("movl %0,%%fs" :: "r" (0));
833 ret = checking_wrmsrl(MSR_FS_BASE, addr);
834 }
835 }
836 put_cpu();
837 break;
838 case ARCH_GET_FS: {
839 unsigned long base;
840 if (task->thread.fsindex == FS_TLS_SEL)
841 base = read_32bit_tls(task, FS_TLS);
842 else if (doit)
843 rdmsrl(MSR_FS_BASE, base);
844 else
845 base = task->thread.fs;
846 ret = put_user(base, (unsigned long __user *)addr);
847 break;
848 }
849 case ARCH_GET_GS: {
850 unsigned long base;
851 unsigned gsindex;
852 if (task->thread.gsindex == GS_TLS_SEL)
853 base = read_32bit_tls(task, GS_TLS);
854 else if (doit) {
855 asm("movl %%gs,%0" : "=r" (gsindex));
856 if (gsindex)
857 rdmsrl(MSR_KERNEL_GS_BASE, base);
858 else
859 base = task->thread.gs;
860 }
861 else
862 base = task->thread.gs;
863 ret = put_user(base, (unsigned long __user *)addr);
864 break;
865 }
866
867 default:
868 ret = -EINVAL;
869 break;
870 }
871
872 return ret;
873}
874
875long sys_arch_prctl(int code, unsigned long addr)
876{
877 return do_arch_prctl(current, code, addr);
878}
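/*
 * Illustration (userspace sketch, not part of this file): the syscall
 * above as seen from a 64-bit program.  Raw syscall shown; the ARCH_*
 * codes come from <asm/prctl.h>.  Threading libraries use ARCH_SET_FS
 * the same way to point %fs at the thread control block.
 */
#include <asm/prctl.h>		/* ARCH_SET_GS, ARCH_GET_GS */
#include <sys/syscall.h>
#include <unistd.h>

static long set_gs_base(void *base)
{
	/* Small bases (<4GB) land in a GDT TLS slot, larger ones in
	 * MSR_KERNEL_GS_BASE -- see the ARCH_SET_GS case above. */
	return syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)base);
}

static unsigned long get_gs_base(void)
{
	unsigned long base = 0;

	syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
	return base;
}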
879
880/*
881 * Capture the user space registers if the task is not running (in user space)
882 */
883int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
884{
885 struct pt_regs *pp, ptregs;
886
887 pp = task_pt_regs(tsk);
888
889 ptregs = *pp;
890 ptregs.cs &= 0xffff;
891 ptregs.ss &= 0xffff;
892
893 elf_core_copy_regs(regs, &ptregs);
894
895 return 1;
896}
897
898unsigned long arch_align_stack(unsigned long sp)
899{
900 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
901 sp -= get_random_int() % 8192;
902 return sp & ~0xf;
903}
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
new file mode 100644
index 000000000000..7c1b92522e95
--- /dev/null
+++ b/arch/x86/kernel/ptrace_32.c
@@ -0,0 +1,723 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 */
7
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/mm.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/ptrace.h>
14#include <linux/user.h>
15#include <linux/security.h>
16#include <linux/audit.h>
17#include <linux/seccomp.h>
18#include <linux/signal.h>
19
20#include <asm/uaccess.h>
21#include <asm/pgtable.h>
22#include <asm/system.h>
23#include <asm/processor.h>
24#include <asm/i387.h>
25#include <asm/debugreg.h>
26#include <asm/ldt.h>
27#include <asm/desc.h>
28
29/*
30 * does not yet catch signals sent when the child dies.
31 * in exit.c or in signal.c.
32 */
33
34/*
35 * Determines which flags the user has access to [1 = access, 0 = no access].
36 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
37 * Also masks reserved bits (31-22, 15, 5, 3, 1).
38 */
39#define FLAG_MASK 0x00050dd5
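/* Decoded for reference (added note, not in the original source): with the
 * mask above the tracer may change CF, PF, AF, ZF, SF, TF, DF, OF, RF and
 * AC; everything else in eflags is preserved by putreg(). */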
40
41/* sets the trap flag. */
42#define TRAP_FLAG 0x100
43
44/*
45 * Offset of eflags on child stack..
46 */
47#define EFL_OFFSET offsetof(struct pt_regs, eflags)
48
49static inline struct pt_regs *get_child_regs(struct task_struct *task)
50{
51 void *stack_top = (void *)task->thread.esp0;
52 return stack_top - sizeof(struct pt_regs);
53}
54
55/*
56 * This routine will get a word off of the process's privileged stack.
57 * The offset is bytes into the pt_regs structure on the stack.
58 * This routine assumes that all the privileged stacks are in our
59 * data space.
60 */
61static inline int get_stack_long(struct task_struct *task, int offset)
62{
63 unsigned char *stack;
64
65 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
66 stack += offset;
67 return (*((int *)stack));
68}
69
70/*
71 * This routine will put a word on the process's privileged stack.
72 * The offset is bytes into the pt_regs structure on the stack.
73 * This routine assumes that all the privileged stacks are in our
74 * data space.
75 */
76static inline int put_stack_long(struct task_struct *task, int offset,
77 unsigned long data)
78{
79 unsigned char * stack;
80
81 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
82 stack += offset;
83 *(unsigned long *) stack = data;
84 return 0;
85}
86
87static int putreg(struct task_struct *child,
88 unsigned long regno, unsigned long value)
89{
90 switch (regno >> 2) {
91 case GS:
92 if (value && (value & 3) != 3)
93 return -EIO;
94 child->thread.gs = value;
95 return 0;
96 case DS:
97 case ES:
98 case FS:
99 if (value && (value & 3) != 3)
100 return -EIO;
101 value &= 0xffff;
102 break;
103 case SS:
104 case CS:
105 if ((value & 3) != 3)
106 return -EIO;
107 value &= 0xffff;
108 break;
109 case EFL:
110 value &= FLAG_MASK;
111 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
112 break;
113 }
114 if (regno > FS*4)
115 regno -= 1*4;
116 put_stack_long(child, regno, value);
117 return 0;
118}
119
120static unsigned long getreg(struct task_struct *child,
121 unsigned long regno)
122{
123 unsigned long retval = ~0UL;
124
125 switch (regno >> 2) {
126 case GS:
127 retval = child->thread.gs;
128 break;
129 case DS:
130 case ES:
131 case FS:
132 case SS:
133 case CS:
134 retval = 0xffff;
135 /* fall through */
136 default:
137 if (regno > FS*4)
138 regno -= 1*4;
139 retval &= get_stack_long(child, regno);
140 }
141 return retval;
142}
143
144#define LDT_SEGMENT 4
145
146static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
147{
148 unsigned long addr, seg;
149
150 addr = regs->eip;
151 seg = regs->xcs & 0xffff;
152 if (regs->eflags & VM_MASK) {
153 addr = (addr & 0xffff) + (seg << 4);
154 return addr;
155 }
156
157 /*
158 * We'll assume that the code segments in the GDT
159 * are all zero-based. That is largely true: the
160 * TLS segments are used for data, and the PNPBIOS
161 * and APM bios ones we just ignore here.
162 */
163 if (seg & LDT_SEGMENT) {
164 u32 *desc;
165 unsigned long base;
166
167 seg &= ~7UL;
168
169 down(&child->mm->context.sem);
170 if (unlikely((seg >> 3) >= child->mm->context.size))
171 addr = -1L; /* bogus selector, access would fault */
172 else {
173 desc = child->mm->context.ldt + seg;
174 base = ((desc[0] >> 16) |
175 ((desc[1] & 0xff) << 16) |
176 (desc[1] & 0xff000000));
177
178 /* 16-bit code segment? */
179 if (!((desc[1] >> 22) & 1))
180 addr &= 0xffff;
181 addr += base;
182 }
183 up(&child->mm->context.sem);
184 }
185 return addr;
186}
187
188static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
189{
190 int i, copied;
191 unsigned char opcode[15];
192 unsigned long addr = convert_eip_to_linear(child, regs);
193
194 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
195 for (i = 0; i < copied; i++) {
196 switch (opcode[i]) {
197 /* popf and iret */
198 case 0x9d: case 0xcf:
199 return 1;
200 /* opcode and address size prefixes */
201 case 0x66: case 0x67:
202 continue;
203 /* irrelevant prefixes (segment overrides and repeats) */
204 case 0x26: case 0x2e:
205 case 0x36: case 0x3e:
206 case 0x64: case 0x65:
207 case 0xf0: case 0xf2: case 0xf3:
208 continue;
209
210 /*
211 * pushf: NOTE! We should probably not let
212 * the user see the TF bit being set. But
213 * it's more pain than it's worth to avoid
214 * it, and a debugger could emulate this
215 * all in user space if it _really_ cares.
216 */
217 case 0x9c:
218 default:
219 return 0;
220 }
221 }
222 return 0;
223}
224
225static void set_singlestep(struct task_struct *child)
226{
227 struct pt_regs *regs = get_child_regs(child);
228
229 /*
230 * Always set TIF_SINGLESTEP - this guarantees that
231 * we single-step system calls etc.. This will also
232 * cause us to set TF when returning to user mode.
233 */
234 set_tsk_thread_flag(child, TIF_SINGLESTEP);
235
236 /*
237 * If TF was already set, don't do anything else
238 */
239 if (regs->eflags & TRAP_FLAG)
240 return;
241
242 /* Set TF on the kernel stack.. */
243 regs->eflags |= TRAP_FLAG;
244
245 /*
246 * ..but if TF is changed by the instruction we will trace,
247 * don't mark it as being "us" that set it, so that we
248 * won't clear it by hand later.
249 */
250 if (is_setting_trap_flag(child, regs))
251 return;
252
253 child->ptrace |= PT_DTRACE;
254}
255
256static void clear_singlestep(struct task_struct *child)
257{
258 /* Always clear TIF_SINGLESTEP... */
259 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
260
261 /* But touch TF only if it was set by us.. */
262 if (child->ptrace & PT_DTRACE) {
263 struct pt_regs *regs = get_child_regs(child);
264 regs->eflags &= ~TRAP_FLAG;
265 child->ptrace &= ~PT_DTRACE;
266 }
267}
268
269/*
270 * Called by kernel/ptrace.c when detaching..
271 *
272 * Make sure the single step bit is not set.
273 */
274void ptrace_disable(struct task_struct *child)
275{
276 clear_singlestep(child);
277 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
278}
279
280/*
281 * Perform get_thread_area on behalf of the traced child.
282 */
283static int
284ptrace_get_thread_area(struct task_struct *child,
285 int idx, struct user_desc __user *user_desc)
286{
287 struct user_desc info;
288 struct desc_struct *desc;
289
290/*
291 * Get the current Thread-Local Storage area:
292 */
293
294#define GET_BASE(desc) ( \
295 (((desc)->a >> 16) & 0x0000ffff) | \
296 (((desc)->b << 16) & 0x00ff0000) | \
297 ( (desc)->b & 0xff000000) )
298
299#define GET_LIMIT(desc) ( \
300 ((desc)->a & 0x0ffff) | \
301 ((desc)->b & 0xf0000) )
302
303#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
304#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
305#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
306#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
307#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
308#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
309
310 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
311 return -EINVAL;
312
313 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
314
315 info.entry_number = idx;
316 info.base_addr = GET_BASE(desc);
317 info.limit = GET_LIMIT(desc);
318 info.seg_32bit = GET_32BIT(desc);
319 info.contents = GET_CONTENTS(desc);
320 info.read_exec_only = !GET_WRITABLE(desc);
321 info.limit_in_pages = GET_LIMIT_PAGES(desc);
322 info.seg_not_present = !GET_PRESENT(desc);
323 info.useable = GET_USEABLE(desc);
324
325 if (copy_to_user(user_desc, &info, sizeof(info)))
326 return -EFAULT;
327
328 return 0;
329}
330
331/*
332 * Perform set_thread_area on behalf of the traced child.
333 */
334static int
335ptrace_set_thread_area(struct task_struct *child,
336 int idx, struct user_desc __user *user_desc)
337{
338 struct user_desc info;
339 struct desc_struct *desc;
340
341 if (copy_from_user(&info, user_desc, sizeof(info)))
342 return -EFAULT;
343
344 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
345 return -EINVAL;
346
347 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
348 if (LDT_empty(&info)) {
349 desc->a = 0;
350 desc->b = 0;
351 } else {
352 desc->a = LDT_entry_a(&info);
353 desc->b = LDT_entry_b(&info);
354 }
355
356 return 0;
357}
358
359long arch_ptrace(struct task_struct *child, long request, long addr, long data)
360{
361 struct user * dummy = NULL;
362 int i, ret;
363 unsigned long __user *datap = (unsigned long __user *)data;
364
365 switch (request) {
366 /* when I and D space are separate, these will need to be fixed. */
367 case PTRACE_PEEKTEXT: /* read word at location addr. */
368 case PTRACE_PEEKDATA:
369 ret = generic_ptrace_peekdata(child, addr, data);
370 break;
371
372 /* read the word at location addr in the USER area. */
373 case PTRACE_PEEKUSR: {
374 unsigned long tmp;
375
376 ret = -EIO;
377 if ((addr & 3) || addr < 0 ||
378 addr > sizeof(struct user) - 3)
379 break;
380
381 tmp = 0; /* Default return condition */
382 if(addr < FRAME_SIZE*sizeof(long))
383 tmp = getreg(child, addr);
384 if(addr >= (long) &dummy->u_debugreg[0] &&
385 addr <= (long) &dummy->u_debugreg[7]){
386 addr -= (long) &dummy->u_debugreg[0];
387 addr = addr >> 2;
388 tmp = child->thread.debugreg[addr];
389 }
390 ret = put_user(tmp, datap);
391 break;
392 }
393
394 /* when I and D space are separate, this will have to be fixed. */
395 case PTRACE_POKETEXT: /* write the word at location addr. */
396 case PTRACE_POKEDATA:
397 ret = generic_ptrace_pokedata(child, addr, data);
398 break;
399
400 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
401 ret = -EIO;
402 if ((addr & 3) || addr < 0 ||
403 addr > sizeof(struct user) - 3)
404 break;
405
406 if (addr < FRAME_SIZE*sizeof(long)) {
407 ret = putreg(child, addr, data);
408 break;
409 }
410 /* We need to be very careful here. We implicitly
411 want to modify a portion of the task_struct, and we
412 have to be selective about what portions we allow someone
413 to modify. */
414
415 ret = -EIO;
416 if(addr >= (long) &dummy->u_debugreg[0] &&
417 addr <= (long) &dummy->u_debugreg[7]){
418
419 if(addr == (long) &dummy->u_debugreg[4]) break;
420 if(addr == (long) &dummy->u_debugreg[5]) break;
421 if(addr < (long) &dummy->u_debugreg[4] &&
422 ((unsigned long) data) >= TASK_SIZE-3) break;
423
424			/* Sanity-check data. Take one half-byte at a time with
425 * check = (val >> (16 + 4*i)) & 0xf. It contains the
426 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
427 * 2 and 3 are LENi. Given a list of invalid values,
428 * we do mask |= 1 << invalid_value, so that
429 * (mask >> check) & 1 is a correct test for invalid
430 * values.
431 *
432 * R/Wi contains the type of the breakpoint /
433 * watchpoint, LENi contains the length of the watched
434 * data in the watchpoint case.
435 *
436 * The invalid values are:
437 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
438 * - R/Wi == 0x10 (break on I/O reads or writes), so
439 * mask |= 0x4444.
440 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
441 * 0x1110.
442 *
443 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
444 *
445 * See the Intel Manual "System Programming Guide",
446 * 15.2.4
447 *
448 * Note that LENi == 0x10 is defined on x86_64 in long
449 * mode (i.e. even for 32-bit userspace software, but
450 * 64-bit kernel), so the x86_64 mask value is 0x5454.
451 * See the AMD manual no. 24593 (AMD64 System
452 * Programming)*/
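			/* Worked example (added for illustration, not in the
			 * original): R/W0 = 01b (data write) and LEN0 = 11b
			 * (4 bytes) gives check = 1101b = 0xd; bit 13 of
			 * 0x5f54 is 0, so the value passes.  R/W0 = 10b
			 * (break on I/O) gives check = xx10b, e.g. 0x2;
			 * bit 2 of 0x5f54 is 1, so the write is rejected. */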
453
454 if(addr == (long) &dummy->u_debugreg[7]) {
455 data &= ~DR_CONTROL_RESERVED;
456 for(i=0; i<4; i++)
457 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
458 goto out_tsk;
459 if (data)
460 set_tsk_thread_flag(child, TIF_DEBUG);
461 else
462 clear_tsk_thread_flag(child, TIF_DEBUG);
463 }
464 addr -= (long) &dummy->u_debugreg;
465 addr = addr >> 2;
466 child->thread.debugreg[addr] = data;
467 ret = 0;
468 }
469 break;
470
471 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
472 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
473 case PTRACE_CONT: /* restart after signal. */
474 ret = -EIO;
475 if (!valid_signal(data))
476 break;
477 if (request == PTRACE_SYSEMU) {
478 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
479 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
480 } else if (request == PTRACE_SYSCALL) {
481 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
482 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
483 } else {
484 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
485 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
486 }
487 child->exit_code = data;
488 /* make sure the single step bit is not set. */
489 clear_singlestep(child);
490 wake_up_process(child);
491 ret = 0;
492 break;
493
494/*
495 * make the child exit. Best I can do is send it a sigkill.
496 * perhaps it should be put in the status that it wants to
497 * exit.
498 */
499 case PTRACE_KILL:
500 ret = 0;
501 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
502 break;
503 child->exit_code = SIGKILL;
504 /* make sure the single step bit is not set. */
505 clear_singlestep(child);
506 wake_up_process(child);
507 break;
508
509 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
510 case PTRACE_SINGLESTEP: /* set the trap flag. */
511 ret = -EIO;
512 if (!valid_signal(data))
513 break;
514
515 if (request == PTRACE_SYSEMU_SINGLESTEP)
516 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
517 else
518 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
519
520 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
521 set_singlestep(child);
522 child->exit_code = data;
523 /* give it a chance to run. */
524 wake_up_process(child);
525 ret = 0;
526 break;
527
528 case PTRACE_DETACH:
529 /* detach a process that was attached. */
530 ret = ptrace_detach(child, data);
531 break;
532
533 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
534 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
535 ret = -EIO;
536 break;
537 }
538 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
539 __put_user(getreg(child, i), datap);
540 datap++;
541 }
542 ret = 0;
543 break;
544 }
545
546 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
547 unsigned long tmp;
548 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
549 ret = -EIO;
550 break;
551 }
552 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
553 __get_user(tmp, datap);
554 putreg(child, i, tmp);
555 datap++;
556 }
557 ret = 0;
558 break;
559 }
560
561 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
562 if (!access_ok(VERIFY_WRITE, datap,
563 sizeof(struct user_i387_struct))) {
564 ret = -EIO;
565 break;
566 }
567 ret = 0;
568 if (!tsk_used_math(child))
569 init_fpu(child);
570 get_fpregs((struct user_i387_struct __user *)data, child);
571 break;
572 }
573
574 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
575 if (!access_ok(VERIFY_READ, datap,
576 sizeof(struct user_i387_struct))) {
577 ret = -EIO;
578 break;
579 }
580 set_stopped_child_used_math(child);
581 set_fpregs(child, (struct user_i387_struct __user *)data);
582 ret = 0;
583 break;
584 }
585
586 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
587 if (!access_ok(VERIFY_WRITE, datap,
588 sizeof(struct user_fxsr_struct))) {
589 ret = -EIO;
590 break;
591 }
592 if (!tsk_used_math(child))
593 init_fpu(child);
594 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
595 break;
596 }
597
598 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
599 if (!access_ok(VERIFY_READ, datap,
600 sizeof(struct user_fxsr_struct))) {
601 ret = -EIO;
602 break;
603 }
604 set_stopped_child_used_math(child);
605 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
606 break;
607 }
608
609 case PTRACE_GET_THREAD_AREA:
610 ret = ptrace_get_thread_area(child, addr,
611 (struct user_desc __user *) data);
612 break;
613
614 case PTRACE_SET_THREAD_AREA:
615 ret = ptrace_set_thread_area(child, addr,
616 (struct user_desc __user *) data);
617 break;
618
619 default:
620 ret = ptrace_request(child, request, addr, data);
621 break;
622 }
623 out_tsk:
624 return ret;
625}
626
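/*
 * Illustration (debugger-side userspace sketch, not part of this file):
 * driving the PTRACE_PEEKUSR path above to read DR6 of a stopped child.
 * Error handling omitted; the child is assumed to be ptrace-stopped.
 */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>		/* struct user, u_debugreg[] */
#include <stddef.h>		/* offsetof */

static long read_dr6(pid_t child)
{
	/* The offset must be word aligned and inside struct user; the
	 * kernel maps u_debugreg offsets to child->thread.debugreg[]. */
	return ptrace(PTRACE_PEEKUSER, child,
		      offsetof(struct user, u_debugreg[6]), 0);
}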
627void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
628{
629 struct siginfo info;
630
631 tsk->thread.trap_no = 1;
632 tsk->thread.error_code = error_code;
633
634 memset(&info, 0, sizeof(info));
635 info.si_signo = SIGTRAP;
636 info.si_code = TRAP_BRKPT;
637
638 /* User-mode eip? */
639 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
640
641 /* Send us the fakey SIGTRAP */
642 force_sig_info(SIGTRAP, &info, tsk);
643}
644
645/* notification of system call entry/exit
646 * - triggered by current->work.syscall_trace
647 */
648__attribute__((regparm(3)))
649int do_syscall_trace(struct pt_regs *regs, int entryexit)
650{
651 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
652 /*
653 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
654 * interception
655 */
656 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
657 int ret = 0;
658
659 /* do the secure computing check first */
660 if (!entryexit)
661 secure_computing(regs->orig_eax);
662
663 if (unlikely(current->audit_context)) {
664 if (entryexit)
665 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
666 regs->eax);
667 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
668 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
669 * not used, entry.S will call us only on syscall exit, not
670 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
671 * calling send_sigtrap() on syscall entry.
672 *
673 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
674		 * is_singlestep is false, despite its name, so we will still do
675 * the correct thing.
676 */
677 else if (is_singlestep)
678 goto out;
679 }
680
681 if (!(current->ptrace & PT_PTRACED))
682 goto out;
683
684 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
685 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
686 * here. We have to check this and return */
687 if (is_sysemu && entryexit)
688 return 0;
689
690 /* Fake a debug trap */
691 if (is_singlestep)
692 send_sigtrap(current, regs, 0);
693
694 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
695 goto out;
696
697 /* the 0x80 provides a way for the tracing parent to distinguish
698 between a syscall stop and SIGTRAP delivery */
699 /* Note that the debugger could change the result of test_thread_flag!*/
700 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
701
702 /*
703 * this isn't the same as continuing with a signal, but it will do
704 * for normal use. strace only continues with a signal if the
705 * stopping signal is not SIGTRAP. -brl
706 */
707 if (current->exit_code) {
708 send_sig(current->exit_code, current, 1);
709 current->exit_code = 0;
710 }
711 ret = is_sysemu;
712out:
713 if (unlikely(current->audit_context) && !entryexit)
714 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
715 regs->ebx, regs->ecx, regs->edx, regs->esi);
716 if (ret == 0)
717 return 0;
718
719 regs->orig_eax = -1; /* force skip of syscall restarting */
720 if (unlikely(current->audit_context))
721 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
722 return 1;
723}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
new file mode 100644
index 000000000000..eea3702427b4
--- /dev/null
+++ b/arch/x86/kernel/ptrace_64.c
@@ -0,0 +1,627 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/user.h>
17#include <linux/security.h>
18#include <linux/audit.h>
19#include <linux/seccomp.h>
20#include <linux/signal.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34 * does not yet catch signals sent when the child dies.
35 * in exit.c or in signal.c.
36 */
37
38/*
39 * Determines which flags the user has access to [1 = access, 0 = no access].
40 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
41 * Also masks reserved bits (63-22, 15, 5, 3, 1).
42 */
43#define FLAG_MASK 0x54dd5UL
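/* Decoded for reference (added note, not in the original source): this mask
 * lets the tracer change CF, PF, AF, ZF, SF, TF, DF, OF, NT, RF and AC;
 * unlike the 32-bit mask it also allows NT(14). */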
44
45/* sets the trap flag. */
46#define TRAP_FLAG 0x100UL
47
48/*
49 * eflags and offset of eflags on child stack..
50 */
51#define EFLAGS offsetof(struct pt_regs, eflags)
52#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
53
54/*
55 * this routine will get a word off of the process's privileged stack.
56 * the offset is how far from the base address stored in the TSS.
57 * this routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline unsigned long get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.rsp0;
65 stack += offset;
66 return (*((unsigned long *)stack));
67}
68
69/*
70 * this routine will put a word on the process's privileged stack.
71 * the offset is how far from the base address stored in the TSS.
72 * this routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline long put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *) task->thread.rsp0;
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86#define LDT_SEGMENT 4
87
88unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
89{
90 unsigned long addr, seg;
91
92 addr = regs->rip;
93 seg = regs->cs & 0xffff;
94
95 /*
96 * We'll assume that the code segments in the GDT
97 * are all zero-based. That is largely true: the
98 * TLS segments are used for data, and the PNPBIOS
99 * and APM bios ones we just ignore here.
100 */
101 if (seg & LDT_SEGMENT) {
102 u32 *desc;
103 unsigned long base;
104
105 seg &= ~7UL;
106
107 down(&child->mm->context.sem);
108 if (unlikely((seg >> 3) >= child->mm->context.size))
109 addr = -1L; /* bogus selector, access would fault */
110 else {
111 desc = child->mm->context.ldt + seg;
112 base = ((desc[0] >> 16) |
113 ((desc[1] & 0xff) << 16) |
114 (desc[1] & 0xff000000));
115
116 /* 16-bit code segment? */
117 if (!((desc[1] >> 22) & 1))
118 addr &= 0xffff;
119 addr += base;
120 }
121 up(&child->mm->context.sem);
122 }
123
124 return addr;
125}
126
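/*
 * A minimal sketch of the descriptor decoding done above, assuming the usual
 * 32-bit descriptor layout: base 15..0 in bits 31..16 of the first word,
 * base 23..16 in bits 7..0 of the second word, and base 31..24 in bits
 * 31..24 of the second word.  The helper name is hypothetical.
 */
static inline unsigned long ldt_desc_base(const u32 *desc)
{
	return (desc[0] >> 16) |
	       ((desc[1] & 0xff) << 16) |
	       (desc[1] & 0xff000000);
}
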
127static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
128{
129 int i, copied;
130 unsigned char opcode[15];
131 unsigned long addr = convert_rip_to_linear(child, regs);
132
133 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
134 for (i = 0; i < copied; i++) {
135 switch (opcode[i]) {
136 /* popf and iret */
137 case 0x9d: case 0xcf:
138 return 1;
139
140 /* CHECKME: 64 65 */
141
142 /* opcode and address size prefixes */
143 case 0x66: case 0x67:
144 continue;
145 /* irrelevant prefixes (segment overrides and repeats) */
146 case 0x26: case 0x2e:
147 case 0x36: case 0x3e:
148 case 0x64: case 0x65:
149 case 0xf2: case 0xf3:
150 continue;
151
152 case 0x40 ... 0x4f:
153 if (regs->cs != __USER_CS)
154 /* 32-bit mode: register increment */
155 return 0;
156 /* 64-bit mode: REX prefix */
157 continue;
158
159 /* CHECKME: f2, f3 */
160
161 /*
162 * pushf: NOTE! We should probably not let
163 * the user see the TF bit being set. But
164 * it's more pain than it's worth to avoid
165 * it, and a debugger could emulate this
166 * all in user space if it _really_ cares.
167 */
168 case 0x9c:
169 default:
170 return 0;
171 }
172 }
173 return 0;
174}
175
176static void set_singlestep(struct task_struct *child)
177{
178 struct pt_regs *regs = task_pt_regs(child);
179
180 /*
181 * Always set TIF_SINGLESTEP - this guarantees that
182 * we single-step system calls etc.. This will also
183 * cause us to set TF when returning to user mode.
184 */
185 set_tsk_thread_flag(child, TIF_SINGLESTEP);
186
187 /*
188 * If TF was already set, don't do anything else
189 */
190 if (regs->eflags & TRAP_FLAG)
191 return;
192
193 /* Set TF on the kernel stack.. */
194 regs->eflags |= TRAP_FLAG;
195
196 /*
197 * ..but if TF is changed by the instruction we will trace,
198 * don't mark it as being "us" that set it, so that we
199 * won't clear it by hand later.
200 */
201 if (is_setting_trap_flag(child, regs))
202 return;
203
204 child->ptrace |= PT_DTRACE;
205}
206
207static void clear_singlestep(struct task_struct *child)
208{
209 /* Always clear TIF_SINGLESTEP... */
210 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
211
212 /* But touch TF only if it was set by us.. */
213 if (child->ptrace & PT_DTRACE) {
214 struct pt_regs *regs = task_pt_regs(child);
215 regs->eflags &= ~TRAP_FLAG;
216 child->ptrace &= ~PT_DTRACE;
217 }
218}
219
220/*
221 * Called by kernel/ptrace.c when detaching..
222 *
223 * Make sure the single step bit is not set.
224 */
225void ptrace_disable(struct task_struct *child)
226{
227 clear_singlestep(child);
228}
229
230static int putreg(struct task_struct *child,
231 unsigned long regno, unsigned long value)
232{
233 unsigned long tmp;
234
235 switch (regno) {
236 case offsetof(struct user_regs_struct,fs):
237 if (value && (value & 3) != 3)
238 return -EIO;
239 child->thread.fsindex = value & 0xffff;
240 return 0;
241 case offsetof(struct user_regs_struct,gs):
242 if (value && (value & 3) != 3)
243 return -EIO;
244 child->thread.gsindex = value & 0xffff;
245 return 0;
246 case offsetof(struct user_regs_struct,ds):
247 if (value && (value & 3) != 3)
248 return -EIO;
249 child->thread.ds = value & 0xffff;
250 return 0;
251 case offsetof(struct user_regs_struct,es):
252 if (value && (value & 3) != 3)
253 return -EIO;
254 child->thread.es = value & 0xffff;
255 return 0;
256 case offsetof(struct user_regs_struct,ss):
257 if ((value & 3) != 3)
258 return -EIO;
259 value &= 0xffff;
260 return 0;
261 case offsetof(struct user_regs_struct,fs_base):
262 if (value >= TASK_SIZE_OF(child))
263 return -EIO;
264 child->thread.fs = value;
265 return 0;
266 case offsetof(struct user_regs_struct,gs_base):
267 if (value >= TASK_SIZE_OF(child))
268 return -EIO;
269 child->thread.gs = value;
270 return 0;
271 case offsetof(struct user_regs_struct, eflags):
272 value &= FLAG_MASK;
273 tmp = get_stack_long(child, EFL_OFFSET);
274 tmp &= ~FLAG_MASK;
275 value |= tmp;
276 break;
277 case offsetof(struct user_regs_struct,cs):
278 if ((value & 3) != 3)
279 return -EIO;
280 value &= 0xffff;
281 break;
282 }
283 put_stack_long(child, regno - sizeof(struct pt_regs), value);
284 return 0;
285}
286
287static unsigned long getreg(struct task_struct *child, unsigned long regno)
288{
289 unsigned long val;
290 switch (regno) {
291 case offsetof(struct user_regs_struct, fs):
292 return child->thread.fsindex;
293 case offsetof(struct user_regs_struct, gs):
294 return child->thread.gsindex;
295 case offsetof(struct user_regs_struct, ds):
296 return child->thread.ds;
297 case offsetof(struct user_regs_struct, es):
298 return child->thread.es;
299 case offsetof(struct user_regs_struct, fs_base):
300 return child->thread.fs;
301 case offsetof(struct user_regs_struct, gs_base):
302 return child->thread.gs;
303 default:
304 regno = regno - sizeof(struct pt_regs);
305 val = get_stack_long(child, regno);
306 if (test_tsk_thread_flag(child, TIF_IA32))
307 val &= 0xffffffff;
308 return val;
309 }
310
311}
312
313long arch_ptrace(struct task_struct *child, long request, long addr, long data)
314{
315 long i, ret;
316 unsigned ui;
317
318 switch (request) {
319 /* when I and D space are separate, these will need to be fixed. */
320 case PTRACE_PEEKTEXT: /* read word at location addr. */
321 case PTRACE_PEEKDATA:
322 ret = generic_ptrace_peekdata(child, addr, data);
323 break;
324
325 /* read the word at location addr in the USER area. */
326 case PTRACE_PEEKUSR: {
327 unsigned long tmp;
328
329 ret = -EIO;
330 if ((addr & 7) ||
331 addr > sizeof(struct user) - 7)
332 break;
333
334 switch (addr) {
335 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
336 tmp = getreg(child, addr);
337 break;
338 case offsetof(struct user, u_debugreg[0]):
339 tmp = child->thread.debugreg0;
340 break;
341 case offsetof(struct user, u_debugreg[1]):
342 tmp = child->thread.debugreg1;
343 break;
344 case offsetof(struct user, u_debugreg[2]):
345 tmp = child->thread.debugreg2;
346 break;
347 case offsetof(struct user, u_debugreg[3]):
348 tmp = child->thread.debugreg3;
349 break;
350 case offsetof(struct user, u_debugreg[6]):
351 tmp = child->thread.debugreg6;
352 break;
353 case offsetof(struct user, u_debugreg[7]):
354 tmp = child->thread.debugreg7;
355 break;
356 default:
357 tmp = 0;
358 break;
359 }
360 ret = put_user(tmp,(unsigned long __user *) data);
361 break;
362 }
363
364 /* when I and D space are separate, this will have to be fixed. */
365 case PTRACE_POKETEXT: /* write the word at location addr. */
366 case PTRACE_POKEDATA:
367 ret = generic_ptrace_pokedata(child, addr, data);
368 break;
369
370 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
371 {
372 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
373 ret = -EIO;
374 if ((addr & 7) ||
375 addr > sizeof(struct user) - 7)
376 break;
377
378 switch (addr) {
379 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
380 ret = putreg(child, addr, data);
381 break;
382		/* Disallow setting a breakpoint in the vsyscall area */
383 case offsetof(struct user, u_debugreg[0]):
384 if (data >= TASK_SIZE_OF(child) - dsize) break;
385 child->thread.debugreg0 = data;
386 ret = 0;
387 break;
388 case offsetof(struct user, u_debugreg[1]):
389 if (data >= TASK_SIZE_OF(child) - dsize) break;
390 child->thread.debugreg1 = data;
391 ret = 0;
392 break;
393 case offsetof(struct user, u_debugreg[2]):
394 if (data >= TASK_SIZE_OF(child) - dsize) break;
395 child->thread.debugreg2 = data;
396 ret = 0;
397 break;
398 case offsetof(struct user, u_debugreg[3]):
399 if (data >= TASK_SIZE_OF(child) - dsize) break;
400 child->thread.debugreg3 = data;
401 ret = 0;
402 break;
403 case offsetof(struct user, u_debugreg[6]):
404 if (data >> 32)
405 break;
406 child->thread.debugreg6 = data;
407 ret = 0;
408 break;
409 case offsetof(struct user, u_debugreg[7]):
410 /* See arch/i386/kernel/ptrace.c for an explanation of
411 * this awkward check.*/
412 data &= ~DR_CONTROL_RESERVED;
413 for(i=0; i<4; i++)
414 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
415 break;
416 if (i == 4) {
417 child->thread.debugreg7 = data;
418 if (data)
419 set_tsk_thread_flag(child, TIF_DEBUG);
420 else
421 clear_tsk_thread_flag(child, TIF_DEBUG);
422 ret = 0;
423 }
424 break;
425 }
426 break;
427 }
428 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
429 case PTRACE_CONT: /* restart after signal. */
430
431 ret = -EIO;
432 if (!valid_signal(data))
433 break;
434 if (request == PTRACE_SYSCALL)
435 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
436 else
437 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
438 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
439 child->exit_code = data;
440 /* make sure the single step bit is not set. */
441 clear_singlestep(child);
442 wake_up_process(child);
443 ret = 0;
444 break;
445
446#ifdef CONFIG_IA32_EMULATION
447	/* This only makes sense with 32-bit programs. Allow a
448	   64-bit debugger to fully examine them too. Better
449	   not to use it against 64-bit processes; use
450	   PTRACE_ARCH_PRCTL instead. */
451 case PTRACE_SET_THREAD_AREA: {
452 struct user_desc __user *p;
453 int old;
454 p = (struct user_desc __user *)data;
455 get_user(old, &p->entry_number);
456 put_user(addr, &p->entry_number);
457 ret = do_set_thread_area(&child->thread, p);
458 put_user(old, &p->entry_number);
459 break;
460 case PTRACE_GET_THREAD_AREA:
461 p = (struct user_desc __user *)data;
462 get_user(old, &p->entry_number);
463 put_user(addr, &p->entry_number);
464 ret = do_get_thread_area(&child->thread, p);
465 put_user(old, &p->entry_number);
466 break;
467 }
468#endif
469 /* normal 64bit interface to access TLS data.
470 Works just like arch_prctl, except that the arguments
471 are reversed. */
472 case PTRACE_ARCH_PRCTL:
473 ret = do_arch_prctl(child, data, addr);
474 break;
475
476/*
477 * make the child exit. Best I can do is send it a sigkill.
478 * perhaps it should be put in the status that it wants to
479 * exit.
480 */
481 case PTRACE_KILL:
482 ret = 0;
483 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
484 break;
485 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
486 child->exit_code = SIGKILL;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 break;
491
492 case PTRACE_SINGLESTEP: /* set the trap flag. */
493 ret = -EIO;
494 if (!valid_signal(data))
495 break;
496 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
497 set_singlestep(child);
498 child->exit_code = data;
499 /* give it a chance to run. */
500 wake_up_process(child);
501 ret = 0;
502 break;
503
504 case PTRACE_DETACH:
505 /* detach a process that was attached. */
506 ret = ptrace_detach(child, data);
507 break;
508
509 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
510 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
511 sizeof(struct user_regs_struct))) {
512 ret = -EIO;
513 break;
514 }
515 ret = 0;
516 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
517 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
518 data += sizeof(long);
519 }
520 break;
521 }
522
523 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
524 unsigned long tmp;
525 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
526 sizeof(struct user_regs_struct))) {
527 ret = -EIO;
528 break;
529 }
530 ret = 0;
531 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
532 ret = __get_user(tmp, (unsigned long __user *) data);
533 if (ret)
534 break;
535 ret = putreg(child, ui, tmp);
536 if (ret)
537 break;
538 data += sizeof(long);
539 }
540 break;
541 }
542
543 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
544 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
545 sizeof(struct user_i387_struct))) {
546 ret = -EIO;
547 break;
548 }
549 ret = get_fpregs((struct user_i387_struct __user *)data, child);
550 break;
551 }
552
553 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
554 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
555 sizeof(struct user_i387_struct))) {
556 ret = -EIO;
557 break;
558 }
559 set_stopped_child_used_math(child);
560 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
561 break;
562 }
563
564 default:
565 ret = ptrace_request(child, request, addr, data);
566 break;
567 }
568 return ret;
569}
570
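/*
 * A minimal sketch of the u_debugreg[7] check used in PTRACE_POKEUSR above,
 * assuming the usual DR7 layout: breakpoint i owns a 4-bit field at bits
 * 16 + 4*i (two R/W-type bits plus two length bits).  0x5554 is a 16-entry
 * bitmap with a 1 for every field value that must be rejected: R/W type 10
 * (I/O breakpoints) and any non-zero length on an execute breakpoint.  The
 * helper name is hypothetical.
 */
static inline int dr7_bp_field_invalid(unsigned long dr7, int i)
{
	unsigned int field = (dr7 >> (16 + 4 * i)) & 0xf;

	return (0x5554 >> field) & 1;
}
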
571static void syscall_trace(struct pt_regs *regs)
572{
573
574#if 0
575 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
576 current->comm,
577 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
578 current_thread_info()->flags, current->ptrace);
579#endif
580
581 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
582 ? 0x80 : 0));
583 /*
584 * this isn't the same as continuing with a signal, but it will do
585 * for normal use. strace only continues with a signal if the
586 * stopping signal is not SIGTRAP. -brl
587 */
588 if (current->exit_code) {
589 send_sig(current->exit_code, current, 1);
590 current->exit_code = 0;
591 }
592}
593
594asmlinkage void syscall_trace_enter(struct pt_regs *regs)
595{
596 /* do the secure computing check first */
597 secure_computing(regs->orig_rax);
598
599 if (test_thread_flag(TIF_SYSCALL_TRACE)
600 && (current->ptrace & PT_PTRACED))
601 syscall_trace(regs);
602
603 if (unlikely(current->audit_context)) {
604 if (test_thread_flag(TIF_IA32)) {
605 audit_syscall_entry(AUDIT_ARCH_I386,
606 regs->orig_rax,
607 regs->rbx, regs->rcx,
608 regs->rdx, regs->rsi);
609 } else {
610 audit_syscall_entry(AUDIT_ARCH_X86_64,
611 regs->orig_rax,
612 regs->rdi, regs->rsi,
613 regs->rdx, regs->r10);
614 }
615 }
616}
617
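/*
 * (Illustrative note on the two audit_syscall_entry() branches above: the
 * register picks differ because the syscall ABIs differ -- 32-bit tasks pass
 * their first arguments in ebx/ecx/edx/esi, while native 64-bit tasks use
 * rdi/rsi/rdx/r10, r10 standing in for rcx because the syscall instruction
 * clobbers rcx.)
 */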
618asmlinkage void syscall_trace_leave(struct pt_regs *regs)
619{
620 if (unlikely(current->audit_context))
621 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
622
623 if ((test_thread_flag(TIF_SYSCALL_TRACE)
624 || test_thread_flag(TIF_SINGLESTEP))
625 && (current->ptrace & PT_PTRACED))
626 syscall_trace(regs);
627}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 000000000000..6722469c2633
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,49 @@
1/*
2 * This file contains work-arounds for x86 and x86_64 platform bugs.
3 */
4#include <linux/pci.h>
5#include <linux/irq.h>
6
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{
11 u8 config, rev;
12 u32 word;
13
14 /* BIOS may enable hardware IRQ balancing for
15	 * E7520/E7320/E7525 (revision ID 0x9 and below)
16 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms.
18 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9)
21 return;
22
23 /* enable access to config space*/
24 pci_read_config_byte(dev, 0xf4, &config);
25 pci_write_config_byte(dev, 0xf4, config|0x2);
26
27 /* read xTPR register */
28 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
29
30 if (!(word & (1 << 13))) {
31 printk(KERN_INFO "Intel E7520/7320/7525 detected. "
32 "Disabling irq balancing and affinity\n");
33#ifdef CONFIG_IRQBALANCE
34 irqbalance_disable("");
35#endif
36 noirqdebug_setup("");
37#ifdef CONFIG_PROC_FS
38 no_irq_affinity = 1;
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
49#endif
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c
new file mode 100644
index 000000000000..b37ed226830a
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.c
@@ -0,0 +1,413 @@
1/*
2 * linux/arch/i386/kernel/reboot.c
3 */
4
5#include <linux/mm.h>
6#include <linux/module.h>
7#include <linux/delay.h>
8#include <linux/init.h>
9#include <linux/interrupt.h>
10#include <linux/mc146818rtc.h>
11#include <linux/efi.h>
12#include <linux/dmi.h>
13#include <linux/ctype.h>
14#include <linux/pm.h>
15#include <linux/reboot.h>
16#include <asm/uaccess.h>
17#include <asm/apic.h>
18#include <asm/desc.h>
19#include "mach_reboot.h"
20#include <asm/reboot_fixups.h>
21#include <asm/reboot.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static int reboot_mode;
30static int reboot_thru_bios;
31
32#ifdef CONFIG_SMP
33static int reboot_cpu = -1;
34#endif
35static int __init reboot_setup(char *str)
36{
37 while(1) {
38 switch (*str) {
39 case 'w': /* "warm" reboot (no memory testing etc) */
40 reboot_mode = 0x1234;
41 break;
42 case 'c': /* "cold" reboot (with memory testing etc) */
43 reboot_mode = 0x0;
44 break;
45 case 'b': /* "bios" reboot by jumping through the BIOS */
46 reboot_thru_bios = 1;
47 break;
48 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
49 reboot_thru_bios = 0;
50 break;
51#ifdef CONFIG_SMP
52 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
53 if (isdigit(*(str+1))) {
54 reboot_cpu = (int) (*(str+1) - '0');
55 if (isdigit(*(str+2)))
56 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
57 }
58 /* we will leave sorting out the final value
59 when we are ready to reboot, since we might not
60 have set up boot_cpu_id or smp_num_cpu */
61 break;
62#endif
63 }
64 if((str = strchr(str,',')) != NULL)
65 str++;
66 else
67 break;
68 }
69 return 1;
70}
71
72__setup("reboot=", reboot_setup);
73
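/*
 * Illustrative examples of the option string parsed above (options are
 * comma-separated; a single character selects each one):
 *
 *	reboot=w	warm reboot, skip the memory test
 *	reboot=b	reboot by jumping through the BIOS
 *	reboot=c,b	cold reboot, via the BIOS
 *	reboot=s2	on SMP, run the final reset on CPU 2
 */
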
74/*
75 * Reboot options and system auto-detection code provided by
76 * Dell Inc. so their systems "just work". :-)
77 */
78
79/*
80 * Some machines require the "reboot=b" command-line option; this quirk makes that automatic.
81 */
82static int __init set_bios_reboot(const struct dmi_system_id *d)
83{
84 if (!reboot_thru_bios) {
85 reboot_thru_bios = 1;
86 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
87 }
88 return 0;
89}
90
91static struct dmi_system_id __initdata reboot_dmi_table[] = {
92 { /* Handle problems with rebooting on Dell E520's */
93 .callback = set_bios_reboot,
94 .ident = "Dell E520",
95 .matches = {
96 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
97 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
98 },
99 },
100 { /* Handle problems with rebooting on Dell 1300's */
101 .callback = set_bios_reboot,
102 .ident = "Dell PowerEdge 1300",
103 .matches = {
104 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
105 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
106 },
107 },
108 { /* Handle problems with rebooting on Dell 300's */
109 .callback = set_bios_reboot,
110 .ident = "Dell PowerEdge 300",
111 .matches = {
112 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
113 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
114 },
115 },
116 { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
117 .callback = set_bios_reboot,
118 .ident = "Dell OptiPlex 745",
119 .matches = {
120 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
121 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
122 DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
123 },
124 },
125 { /* Handle problems with rebooting on Dell 2400's */
126 .callback = set_bios_reboot,
127 .ident = "Dell PowerEdge 2400",
128 .matches = {
129 DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
130 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
131 },
132 },
133 { /* Handle problems with rebooting on HP laptops */
134 .callback = set_bios_reboot,
135 .ident = "HP Compaq Laptop",
136 .matches = {
137 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
138 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
139 },
140 },
141 { }
142};
143
144static int __init reboot_init(void)
145{
146 dmi_check_system(reboot_dmi_table);
147 return 0;
148}
149
150core_initcall(reboot_init);
151
152/* The following code and data reboots the machine by switching to real
153 mode and jumping to the BIOS reset entry point, as if the CPU has
154 really been reset. The previous version asked the keyboard
155 controller to pulse the CPU reset line, which is more thorough, but
156 doesn't work with at least one type of 486 motherboard. It is easy
157 to stop this code working; hence the copious comments. */
158
159static unsigned long long
160real_mode_gdt_entries [3] =
161{
162 0x0000000000000000ULL, /* Null descriptor */
163 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
164 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
165};
166
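/*
 * A minimal sketch of how the two descriptors above decompose, assuming the
 * classic segment-descriptor layout (limit 15..0, base 23..0, access byte,
 * flags/limit 19..16, base 31..24).  The helper is hypothetical:
 * mk_realmode_desc(0x00000000, 0xffff, 0x9a) == 0x00009a000000ffffULL and
 * mk_realmode_desc(0x00000100, 0xffff, 0x92) == 0x000092000100ffffULL.
 */
static inline unsigned long long mk_realmode_desc(unsigned long base,
						  unsigned long limit,
						  unsigned char access)
{
	return (limit & 0xffffULL) |			/* limit 15..0  */
	       ((base & 0xffffffULL) << 16) |		/* base 23..0   */
	       ((unsigned long long)access << 40) |	/* access byte  */
	       ((limit & 0xf0000ULL) << 32) |		/* limit 19..16 */
	       ((base & 0xff000000ULL) << 32);		/* base 31..24  */
}
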
167static struct Xgt_desc_struct
168real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
169real_mode_idt = { 0x3ff, 0 },
170no_idt = { 0, 0 };
171
172
173/* This is 16-bit protected mode code to disable paging and the cache,
174 switch to real mode and jump to the BIOS reset code.
175
176 The instruction that switches to real mode by writing to CR0 must be
177   followed immediately by a far jump instruction, which sets CS to a
178 valid value for real mode, and flushes the prefetch queue to avoid
179 running instructions that have already been decoded in protected
180 mode.
181
182 Clears all the flags except ET, especially PG (paging), PE
183 (protected-mode enable) and TS (task switch for coprocessor state
184 save). Flushes the TLB after paging has been disabled. Sets CD and
185 NW, to disable the cache on a 486, and invalidates the cache. This
186 is more like the state of a 486 after reset. I don't know if
187 something else should be done for other chips.
188
189 More could be done here to set up the registers as if a CPU reset had
190 occurred; hopefully real BIOSs don't assume much. */
191
192static unsigned char real_mode_switch [] =
193{
194 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
195 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
196 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
197 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
198 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
199 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
200 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
201 0x74, 0x02, /* jz f */
202 0x0f, 0x09, /* wbinvd */
203 0x24, 0x10, /* f: andb $0x10,al */
204 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
205};
206static unsigned char jump_to_bios [] =
207{
208 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
209};
210
211/*
212 * Switch to real mode and then execute the code
213 * specified by the code and length parameters.
214 * We assume that length will always be less than 100!
215 */
216void machine_real_restart(unsigned char *code, int length)
217{
218 local_irq_disable();
219
220 /* Write zero to CMOS register number 0x0f, which the BIOS POST
221 routine will recognize as telling it to do a proper reboot. (Well
222 that's what this book in front of me says -- it may only apply to
223 the Phoenix BIOS though, it's not clear). At the same time,
224 disable NMIs by setting the top bit in the CMOS address register,
225 as we're about to do peculiar things to the CPU. I'm not sure if
226 `outb_p' is needed instead of just `outb'. Use it to be on the
227 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
228 */
229
230 spin_lock(&rtc_lock);
231 CMOS_WRITE(0x00, 0x8f);
232 spin_unlock(&rtc_lock);
233
234 /* Remap the kernel at virtual address zero, as well as offset zero
235 from the kernel segment. This assumes the kernel segment starts at
236 virtual address PAGE_OFFSET. */
237
238 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
239 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
240
241 /*
242 * Use `swapper_pg_dir' as our page directory.
243 */
244 load_cr3(swapper_pg_dir);
245
246 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
247 this on booting to tell it to "Bypass memory test (also warm
248 boot)". This seems like a fairly standard thing that gets set by
249 REBOOT.COM programs, and the previous reset routine did this
250 too. */
251
252 *((unsigned short *)0x472) = reboot_mode;
253
254 /* For the switch to real mode, copy some code to low memory. It has
255 to be in the first 64k because it is running in 16-bit mode, and it
256 has to have the same physical and virtual address, because it turns
257 off paging. Copy it near the end of the first page, out of the way
258 of BIOS variables. */
259
260 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
261 real_mode_switch, sizeof (real_mode_switch));
262 memcpy ((void *) (0x1000 - 100), code, length);
263
264 /* Set up the IDT for real mode. */
265
266 load_idt(&real_mode_idt);
267
268 /* Set up a GDT from which we can load segment descriptors for real
269 mode. The GDT is not used in real mode; it is just needed here to
270 prepare the descriptors. */
271
272 load_gdt(&real_mode_gdt);
273
274 /* Load the data segment registers, and thus the descriptors ready for
275 real mode. The base address of each segment is 0x100, 16 times the
276 selector value being loaded here. This is so that the segment
277 registers don't have to be reloaded after switching to real mode:
278 the values are consistent for real mode operation already. */
279
280 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
281 "\tmovl %%eax,%%ds\n"
282 "\tmovl %%eax,%%es\n"
283 "\tmovl %%eax,%%fs\n"
284 "\tmovl %%eax,%%gs\n"
285 "\tmovl %%eax,%%ss" : : : "eax");
286
287 /* Jump to the 16-bit code that we copied earlier. It disables paging
288 and the cache, switches to real mode, and jumps to the BIOS reset
289 entry point. */
290
291 __asm__ __volatile__ ("ljmp $0x0008,%0"
292 :
293 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
294}
295#ifdef CONFIG_APM_MODULE
296EXPORT_SYMBOL(machine_real_restart);
297#endif
298
299static void native_machine_shutdown(void)
300{
301#ifdef CONFIG_SMP
302 int reboot_cpu_id;
303
304 /* The boot cpu is always logical cpu 0 */
305 reboot_cpu_id = 0;
306
307 /* See if there has been given a command line override */
308 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
309 cpu_isset(reboot_cpu, cpu_online_map)) {
310 reboot_cpu_id = reboot_cpu;
311 }
312
313 /* Make certain the cpu I'm rebooting on is online */
314 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
315 reboot_cpu_id = smp_processor_id();
316 }
317
318 /* Make certain I only run on the appropriate processor */
319 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
320
321 /* O.K. Now that I'm on the appropriate processor, stop
322 * all of the others, and disable their local APICs.
323 */
324
325 smp_send_stop();
326#endif /* CONFIG_SMP */
327
328 lapic_shutdown();
329
330#ifdef CONFIG_X86_IO_APIC
331 disable_IO_APIC();
332#endif
333}
334
335void __attribute__((weak)) mach_reboot_fixups(void)
336{
337}
338
339static void native_machine_emergency_restart(void)
340{
341 if (!reboot_thru_bios) {
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
361}
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart();
367}
368
369static void native_machine_halt(void)
370{
371}
372
373static void native_machine_power_off(void)
374{
375 if (pm_power_off) {
376 machine_shutdown();
377 pm_power_off();
378 }
379}
380
381
382struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off,
384 .shutdown = native_machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart,
386 .restart = native_machine_restart,
387 .halt = native_machine_halt,
388};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
new file mode 100644
index 000000000000..368db2b9c5ac
--- /dev/null
+++ b/arch/x86/kernel/reboot_64.c
@@ -0,0 +1,171 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/hw_irq.h>
15#include <asm/system.h>
16#include <asm/pgtable.h>
17#include <asm/tlbflush.h>
18#include <asm/apic.h>
19#include <asm/iommu.h>
20
21/*
22 * Power off function, if any
23 */
24void (*pm_power_off)(void);
25EXPORT_SYMBOL(pm_power_off);
26
27static long no_idt[3];
28static enum {
29 BOOT_TRIPLE = 't',
30 BOOT_KBD = 'k'
31} reboot_type = BOOT_KBD;
32static int reboot_mode = 0;
33int reboot_force;
34
35/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
36 warm Don't set the cold reboot flag
37 cold Set the cold reboot flag
38 triple Force a triple fault (init)
39 kbd Use the keyboard controller. cold reset (default)
40 force Avoid anything that could hang.
41 */
42static int __init reboot_setup(char *str)
43{
44 for (;;) {
45 switch (*str) {
46 case 'w':
47 reboot_mode = 0x1234;
48 break;
49
50 case 'c':
51 reboot_mode = 0;
52 break;
53
54 case 't':
55 case 'b':
56 case 'k':
57 reboot_type = *str;
58 break;
59 case 'f':
60 reboot_force = 1;
61 break;
62 }
63 if((str = strchr(str,',')) != NULL)
64 str++;
65 else
66 break;
67 }
68 return 1;
69}
70
71__setup("reboot=", reboot_setup);
72
73static inline void kb_wait(void)
74{
75 int i;
76
77 for (i=0; i<0x10000; i++)
78 if ((inb_p(0x64) & 0x02) == 0)
79 break;
80}
81
82void machine_shutdown(void)
83{
84 unsigned long flags;
85
86 /* Stop the cpus and apics */
87#ifdef CONFIG_SMP
88 int reboot_cpu_id;
89
90 /* The boot cpu is always logical cpu 0 */
91 reboot_cpu_id = 0;
92
93 /* Make certain the cpu I'm about to reboot on is online */
94 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
95 reboot_cpu_id = smp_processor_id();
96 }
97
98 /* Make certain I only run on the appropriate processor */
99 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
100
101	/* O.K. Now that I'm on the appropriate processor,
102 * stop all of the others.
103 */
104 smp_send_stop();
105#endif
106
107 local_irq_save(flags);
108
109#ifndef CONFIG_SMP
110 disable_local_APIC();
111#endif
112
113 disable_IO_APIC();
114
115 local_irq_restore(flags);
116
117 pci_iommu_shutdown();
118}
119
120void machine_emergency_restart(void)
121{
122 int i;
123
124 /* Tell the BIOS if we want cold or warm reboot */
125 *((unsigned short *)__va(0x472)) = reboot_mode;
126
127 for (;;) {
128 /* Could also try the reset bit in the Hammer NB */
129 switch (reboot_type) {
130 case BOOT_KBD:
131 for (i=0; i<10; i++) {
132 kb_wait();
133 udelay(50);
134 outb(0xfe,0x64); /* pulse reset low */
135 udelay(50);
136 }
137
138 case BOOT_TRIPLE:
139 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
140 __asm__ __volatile__("int3");
141
142 reboot_type = BOOT_KBD;
143 break;
144 }
145 }
146}
147
148void machine_restart(char * __unused)
149{
150 printk("machine restart\n");
151
152 if (!reboot_force) {
153 machine_shutdown();
154 }
155 machine_emergency_restart();
156}
157
158void machine_halt(void)
159{
160}
161
162void machine_power_off(void)
163{
164 if (pm_power_off) {
165 if (!reboot_force) {
166 machine_shutdown();
167 }
168 pm_power_off();
169 }
170}
171
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 000000000000..03e1cce58f49
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,68 @@
1/*
2 * linux/arch/i386/kernel/reboot_fixups.c
3 *
4 * This is a good place to put board specific reboot fixups.
5 *
6 * List of supported fixups:
7 * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
8 * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
9 *
10 */
11
12#include <asm/delay.h>
13#include <linux/pci.h>
14#include <asm/reboot_fixups.h>
15#include <asm/msr.h>
16
17static void cs5530a_warm_reset(struct pci_dev *dev)
18{
19	/* writing 1 to the reset control register (0x44) causes the
20 cs5530a to perform a system warm reset */
21 pci_write_config_byte(dev, 0x44, 0x1);
22 udelay(50); /* shouldn't get here but be safe and spin-a-while */
23 return;
24}
25
26static void cs5536_warm_reset(struct pci_dev *dev)
27{
28 /*
29 * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
30 * writing 1 to the LSB of this MSR causes a hard reset.
31 */
32 wrmsrl(0x51400017, 1ULL);
33 udelay(50); /* shouldn't get here but be safe and spin a while */
34}
35
36struct device_fixup {
37 unsigned int vendor;
38 unsigned int device;
39 void (*reboot_fixup)(struct pci_dev *);
40};
41
42static struct device_fixup fixups_table[] = {
43{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
44{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
45};
46
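/*
 * (Illustrative note: supporting another board would mean adding one more
 * entry to the table above; the vendor/device IDs and handler here are
 * hypothetical.)
 *
 *	{ PCI_VENDOR_ID_FOO, PCI_DEVICE_ID_FOO_SB, foo_warm_reset },
 */
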
47/*
48 * we see if any fixup is available for our current hardware. if there
49 * is a fixup, we call it and we expect to never return from it. if we
50 * do return, we keep looking and then eventually fall back to the
51 * standard mach_reboot on return.
52 */
53void mach_reboot_fixups(void)
54{
55 struct device_fixup *cur;
56 struct pci_dev *dev;
57 int i;
58
59 for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
60 cur = &(fixups_table[i]);
61 dev = pci_get_device(cur->vendor, cur->device, NULL);
62 if (!dev)
63 continue;
64
65 cur->reboot_fixup(dev);
66 }
67}
68
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 000000000000..f151d6fae462
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,252 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 2)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
21
22 .text
23 .align PAGE_ALIGNED
24 .globl relocate_kernel
25relocate_kernel:
26 movl 8(%esp), %ebp /* list of pages */
27
28#ifdef CONFIG_X86_PAE
29 /* map the control page at its virtual address */
30
31 movl PTR(VA_PGD)(%ebp), %edi
32 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
33 andl $0xc0000000, %eax
34 shrl $27, %eax
35 addl %edi, %eax
36
37 movl PTR(PA_PMD_0)(%ebp), %edx
38 orl $PAE_PGD_ATTR, %edx
39 movl %edx, (%eax)
40
41 movl PTR(VA_PMD_0)(%ebp), %edi
42 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
43 andl $0x3fe00000, %eax
44 shrl $18, %eax
45 addl %edi, %eax
46
47 movl PTR(PA_PTE_0)(%ebp), %edx
48 orl $PAGE_ATTR, %edx
49 movl %edx, (%eax)
50
51 movl PTR(VA_PTE_0)(%ebp), %edi
52 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
53 andl $0x001ff000, %eax
54 shrl $9, %eax
55 addl %edi, %eax
56
57 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
58 orl $PAGE_ATTR, %edx
59 movl %edx, (%eax)
60
61 /* identity map the control page at its physical address */
62
63 movl PTR(VA_PGD)(%ebp), %edi
64 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
65 andl $0xc0000000, %eax
66 shrl $27, %eax
67 addl %edi, %eax
68
69 movl PTR(PA_PMD_1)(%ebp), %edx
70 orl $PAE_PGD_ATTR, %edx
71 movl %edx, (%eax)
72
73 movl PTR(VA_PMD_1)(%ebp), %edi
74 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
75 andl $0x3fe00000, %eax
76 shrl $18, %eax
77 addl %edi, %eax
78
79 movl PTR(PA_PTE_1)(%ebp), %edx
80 orl $PAGE_ATTR, %edx
81 movl %edx, (%eax)
82
83 movl PTR(VA_PTE_1)(%ebp), %edi
84 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
85 andl $0x001ff000, %eax
86 shrl $9, %eax
87 addl %edi, %eax
88
89 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
90 orl $PAGE_ATTR, %edx
91 movl %edx, (%eax)
92#else
93 /* map the control page at its virtual address */
94
95 movl PTR(VA_PGD)(%ebp), %edi
96 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
97 andl $0xffc00000, %eax
98 shrl $20, %eax
99 addl %edi, %eax
100
101 movl PTR(PA_PTE_0)(%ebp), %edx
102 orl $PAGE_ATTR, %edx
103 movl %edx, (%eax)
104
105 movl PTR(VA_PTE_0)(%ebp), %edi
106 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
107 andl $0x003ff000, %eax
108 shrl $10, %eax
109 addl %edi, %eax
110
111 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
112 orl $PAGE_ATTR, %edx
113 movl %edx, (%eax)
114
115 /* identity map the control page at its physical address */
116
117 movl PTR(VA_PGD)(%ebp), %edi
118 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
119 andl $0xffc00000, %eax
120 shrl $20, %eax
121 addl %edi, %eax
122
123 movl PTR(PA_PTE_1)(%ebp), %edx
124 orl $PAGE_ATTR, %edx
125 movl %edx, (%eax)
126
127 movl PTR(VA_PTE_1)(%ebp), %edi
128 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
129 andl $0x003ff000, %eax
130 shrl $10, %eax
131 addl %edi, %eax
132
133 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
134 orl $PAGE_ATTR, %edx
135 movl %edx, (%eax)
136#endif
137
138relocate_new_kernel:
139 /* read the arguments and say goodbye to the stack */
140 movl 4(%esp), %ebx /* page_list */
141 movl 8(%esp), %ebp /* list of pages */
142 movl 12(%esp), %edx /* start address */
143 movl 16(%esp), %ecx /* cpu_has_pae */
144
145 /* zero out flags, and disable interrupts */
146 pushl $0
147 popfl
148
149 /* get physical address of control page now */
150 /* this is impossible after page table switch */
151 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
152
153 /* switch to new set of page tables */
154 movl PTR(PA_PGD)(%ebp), %eax
155 movl %eax, %cr3
156
157 /* setup a new stack at the end of the physical control page */
158 lea 4096(%edi), %esp
159
160 /* jump to identity mapped page */
161 movl %edi, %eax
162 addl $(identity_mapped - relocate_kernel), %eax
163 pushl %eax
164 ret
165
166identity_mapped:
167 /* store the start address on the stack */
168 pushl %edx
169
170 /* Set cr0 to a known state:
171 * 31 0 == Paging disabled
172 * 18 0 == Alignment check disabled
173 * 16 0 == Write protect disabled
174 * 3 0 == No task switch
175 * 2 0 == Don't do FP software emulation.
176	 * 0 1 == Protected mode enabled
177 */
178 movl %cr0, %eax
179 andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
180 orl $(1<<0), %eax
181 movl %eax, %cr0
182
183 /* clear cr4 if applicable */
184 testl %ecx, %ecx
185 jz 1f
186 /* Set cr4 to a known state:
187 * Setting everything to zero seems safe.
188 */
189 movl %cr4, %eax
190 andl $0, %eax
191 movl %eax, %cr4
192
193 jmp 1f
1941:
195
196 /* Flush the TLB (needed?) */
197 xorl %eax, %eax
198 movl %eax, %cr3
199
200 /* Do the copies */
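	/* (Illustrative note on the loop below: each word in the indirection
	 * chain is a page address with kexec flag bits in its low bits --
	 * 0x1 (IND_DESTINATION) selects where the next copies land,
	 * 0x2 (IND_INDIRECTION) chains to the next page of entries,
	 * 0x8 (IND_SOURCE) names a 4 KiB page to copy to the current
	 * destination, and 0x4 (IND_DONE) terminates the walk.) */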
201 movl %ebx, %ecx
202 jmp 1f
203
2040: /* top, read another word from the indirection page */
205 movl (%ebx), %ecx
206 addl $4, %ebx
2071:
208 testl $0x1, %ecx /* is it a destination page */
209 jz 2f
210 movl %ecx, %edi
211 andl $0xfffff000, %edi
212 jmp 0b
2132:
214 testl $0x2, %ecx /* is it an indirection page */
215 jz 2f
216 movl %ecx, %ebx
217 andl $0xfffff000, %ebx
218 jmp 0b
2192:
220 testl $0x4, %ecx /* is it the done indicator */
221 jz 2f
222 jmp 3f
2232:
224 testl $0x8, %ecx /* is it the source indicator */
225 jz 0b /* Ignore it otherwise */
226 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi
228
229 movl $1024, %ecx
230 rep ; movsl
231 jmp 0b
232
2333:
234
235 /* To be certain of avoiding problems with self-modifying code
236 * I need to execute a serializing instruction here.
237	 * So I flush the TLB; it's handy and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241
242 /* set all of the registers to known values */
243 /* leave %esp alone */
244
245 xorl %eax, %eax
246 xorl %ebx, %ebx
247 xorl %ecx, %ecx
248 xorl %edx, %edx
249 xorl %esi, %esi
250 xorl %edi, %edi
251 xorl %ebp, %ebp
252 ret
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644
index 000000000000..14e95872c6a3
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -0,0 +1,276 @@
1/*
2 * relocate_kernel.S - put the kernel image in place to boot
3 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/linkage.h>
10#include <asm/page.h>
11#include <asm/kexec.h>
12
13/*
14 * Must be relocatable PIC code callable as a C function
15 */
16
17#define PTR(x) (x << 3)
18#define PAGE_ALIGNED (1 << PAGE_SHIFT)
19#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
20
21 .text
22 .align PAGE_ALIGNED
23 .code64
24 .globl relocate_kernel
25relocate_kernel:
26 /* %rdi indirection_page
27 * %rsi page_list
28 * %rdx start address
29 */
30
31 /* map the control page at its virtual address */
32
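	/* (Illustrative note: the mask/shift pairs below pick out the 9-bit
	 * index for each of the four page-table levels, roughly
	 * pgd_idx = (va >> 39) & 0x1ff, pud_idx = (va >> 30) & 0x1ff,
	 * pmd_idx = (va >> 21) & 0x1ff, pte_idx = (va >> 12) & 0x1ff in C.
	 * The "- 3" in the shift count folds in the scaling to an 8-byte
	 * table slot, which is why %cl starts at 36 and drops by 9 per
	 * level.) */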
33 movq $0x0000ff8000000000, %r10 /* mask */
34 mov $(39 - 3), %cl /* bits to shift */
35 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
36
37 movq %r11, %r9
38 andq %r10, %r9
39 shrq %cl, %r9
40
41 movq PTR(VA_PGD)(%rsi), %r8
42 addq %r8, %r9
43 movq PTR(PA_PUD_0)(%rsi), %r8
44 orq $PAGE_ATTR, %r8
45 movq %r8, (%r9)
46
47 shrq $9, %r10
48 sub $9, %cl
49
50 movq %r11, %r9
51 andq %r10, %r9
52 shrq %cl, %r9
53
54 movq PTR(VA_PUD_0)(%rsi), %r8
55 addq %r8, %r9
56 movq PTR(PA_PMD_0)(%rsi), %r8
57 orq $PAGE_ATTR, %r8
58 movq %r8, (%r9)
59
60 shrq $9, %r10
61 sub $9, %cl
62
63 movq %r11, %r9
64 andq %r10, %r9
65 shrq %cl, %r9
66
67 movq PTR(VA_PMD_0)(%rsi), %r8
68 addq %r8, %r9
69 movq PTR(PA_PTE_0)(%rsi), %r8
70 orq $PAGE_ATTR, %r8
71 movq %r8, (%r9)
72
73 shrq $9, %r10
74 sub $9, %cl
75
76 movq %r11, %r9
77 andq %r10, %r9
78 shrq %cl, %r9
79
80 movq PTR(VA_PTE_0)(%rsi), %r8
81 addq %r8, %r9
82 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
83 orq $PAGE_ATTR, %r8
84 movq %r8, (%r9)
85
86 /* identity map the control page at its physical address */
87
88 movq $0x0000ff8000000000, %r10 /* mask */
89 mov $(39 - 3), %cl /* bits to shift */
90 movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
91
92 movq %r11, %r9
93 andq %r10, %r9
94 shrq %cl, %r9
95
96 movq PTR(VA_PGD)(%rsi), %r8
97 addq %r8, %r9
98 movq PTR(PA_PUD_1)(%rsi), %r8
99 orq $PAGE_ATTR, %r8
100 movq %r8, (%r9)
101
102 shrq $9, %r10
103 sub $9, %cl
104
105 movq %r11, %r9
106 andq %r10, %r9
107 shrq %cl, %r9
108
109 movq PTR(VA_PUD_1)(%rsi), %r8
110 addq %r8, %r9
111 movq PTR(PA_PMD_1)(%rsi), %r8
112 orq $PAGE_ATTR, %r8
113 movq %r8, (%r9)
114
115 shrq $9, %r10
116 sub $9, %cl
117
118 movq %r11, %r9
119 andq %r10, %r9
120 shrq %cl, %r9
121
122 movq PTR(VA_PMD_1)(%rsi), %r8
123 addq %r8, %r9
124 movq PTR(PA_PTE_1)(%rsi), %r8
125 orq $PAGE_ATTR, %r8
126 movq %r8, (%r9)
127
128 shrq $9, %r10
129 sub $9, %cl
130
131 movq %r11, %r9
132 andq %r10, %r9
133 shrq %cl, %r9
134
135 movq PTR(VA_PTE_1)(%rsi), %r8
136 addq %r8, %r9
137 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
138 orq $PAGE_ATTR, %r8
139 movq %r8, (%r9)
140
141relocate_new_kernel:
142 /* %rdi indirection_page
143 * %rsi page_list
144 * %rdx start address
145 */
146
147 /* zero out flags, and disable interrupts */
148 pushq $0
149 popfq
150
151 /* get physical address of control page now */
152 /* this is impossible after page table switch */
153 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
154
155 /* get physical address of page table now too */
156 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
157
158 /* switch to new set of page tables */
159 movq PTR(PA_PGD)(%rsi), %r9
160 movq %r9, %cr3
161
162 /* setup a new stack at the end of the physical control page */
163 lea 4096(%r8), %rsp
164
165 /* jump to identity mapped page */
166 addq $(identity_mapped - relocate_kernel), %r8
167 pushq %r8
168 ret
169
170identity_mapped:
171 /* store the start address on the stack */
172 pushq %rdx
173
174 /* Set cr0 to a known state:
175 * 31 1 == Paging enabled
176 * 18 0 == Alignment check disabled
177 * 16 0 == Write protect disabled
178 * 3 0 == No task switch
179 * 2 0 == Don't do FP software emulation.
180	 * 0 1 == Protected mode enabled
181 */
182 movq %cr0, %rax
183 andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
184 orl $((1<<31)|(1<<0)), %eax
185 movq %rax, %cr0
186
187 /* Set cr4 to a known state:
188 * 10 0 == xmm exceptions disabled
189 * 9 0 == xmm registers instructions disabled
190 * 8 0 == performance monitoring counter disabled
191 * 7 0 == page global disabled
192 * 6 0 == machine check exceptions disabled
193 * 5 1 == physical address extension enabled
194 * 4 0 == page size extensions disabled
195 * 3 0 == Debug extensions disabled
196 * 2 0 == Time stamp disable (disabled)
197 * 1 0 == Protected mode virtual interrupts disabled
198 * 0 0 == VME disabled
199 */
200
201 movq $((1<<5)), %rax
202 movq %rax, %cr4
203
204 jmp 1f
2051:
206
207 /* Switch to the identity mapped page tables,
208 * and flush the TLB.
209 */
210 movq %rcx, %cr3
211
212 /* Do the copies */
213 movq %rdi, %rcx /* Put the page_list in %rcx */
214 xorq %rdi, %rdi
215 xorq %rsi, %rsi
216 jmp 1f
217
2180:	/* top, read another word from the indirection page */
219
220 movq (%rbx), %rcx
221 addq $8, %rbx
2221:
223 testq $0x1, %rcx /* is it a destination page? */
224 jz 2f
225 movq %rcx, %rdi
226 andq $0xfffffffffffff000, %rdi
227 jmp 0b
2282:
229 testq $0x2, %rcx /* is it an indirection page? */
230 jz 2f
231 movq %rcx, %rbx
232 andq $0xfffffffffffff000, %rbx
233 jmp 0b
2342:
235 testq $0x4, %rcx /* is it the done indicator? */
236 jz 2f
237 jmp 3f
2382:
239 testq $0x8, %rcx /* is it the source indicator? */
240 jz 0b /* Ignore it otherwise */
241	movq	%rcx, %rsi	/* For every source page do a copy */
242 andq $0xfffffffffffff000, %rsi
243
244 movq $512, %rcx
245 rep ; movsq
246 jmp 0b
2473:
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB by reloading %cr3 here, it's handy,
252 * and not processor dependent.
253 */
254 movq %cr3, %rax
255 movq %rax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %rsp alone */
259
260 xorq %rax, %rax
261 xorq %rbx, %rbx
262 xorq %rcx, %rcx
263 xorq %rdx, %rdx
264 xorq %rsi, %rsi
265 xorq %rdi, %rdi
266 xorq %rbp, %rbp
267 xorq %r8, %r8
268 xorq %r9, %r9
269	xorq	%r10, %r10
270 xorq %r11, %r11
271 xorq %r12, %r12
272 xorq %r13, %r13
273 xorq %r14, %r14
274 xorq %r15, %r15
275
276 ret
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
new file mode 100644
index 000000000000..c7d3df23f589
--- /dev/null
+++ b/arch/x86/kernel/scx200_32.c
@@ -0,0 +1,131 @@
1/* linux/arch/i386/kernel/scx200.c
2
3 Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
4
5 National Semiconductor SCx200 support. */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mutex.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15#include <linux/scx200_gpio.h>
16
17/* Verify that the configuration block really is there */
18#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
19
20#define NAME "scx200"
21
22MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
23MODULE_DESCRIPTION("NatSemi SCx200 Driver");
24MODULE_LICENSE("GPL");
25
26unsigned scx200_gpio_base = 0;
27long scx200_gpio_shadow[2];
28
29unsigned scx200_cb_base = 0;
30
31static struct pci_device_id scx200_tbl[] = {
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
35 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
36 { },
37};
38MODULE_DEVICE_TABLE(pci,scx200_tbl);
39
40static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
41
42static struct pci_driver scx200_pci_driver = {
43 .name = "scx200",
44 .id_table = scx200_tbl,
45 .probe = scx200_probe,
46};
47
48static DEFINE_MUTEX(scx200_gpio_config_lock);
49
50static void __devinit scx200_init_shadow(void)
51{
52 int bank;
53
54 /* read the current values driven on the GPIO signals */
55 for (bank = 0; bank < 2; ++bank)
56 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
57}
58
59static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
60{
61 unsigned base;
62
63 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
64 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67
68 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY;
71 }
72
73 scx200_gpio_base = base;
74 scx200_init_shadow();
75
76 } else {
77 /* find the base of the Configuration Block */
78 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
79 scx200_cb_base = SCx200_CB_BASE_FIXED;
80 } else {
81 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
82 if (scx200_cb_probe(base)) {
83 scx200_cb_base = base;
84 } else {
85 printk(KERN_WARNING NAME ": Configuration Block not found\n");
86 return -ENODEV;
87 }
88 }
89 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
90 }
91
92 return 0;
93}
94
95u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
96{
97 u32 config, new_config;
98
99 mutex_lock(&scx200_gpio_config_lock);
100
101 outl(index, scx200_gpio_base + 0x20);
102 config = inl(scx200_gpio_base + 0x24);
103
104 new_config = (config & mask) | bits;
105 outl(new_config, scx200_gpio_base + 0x24);
106
107 mutex_unlock(&scx200_gpio_config_lock);
108
109 return config;
110}
111
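/*
 * A minimal usage sketch for scx200_gpio_configure() above (the mask/bits
 * values are made up for illustration): keep every configuration bit of
 * GPIO 7 except the low two, force those two on, and remember the previous
 * word so it can be restored later.
 *
 *	u32 old = scx200_gpio_configure(7, ~3, 3);
 *	...
 *	scx200_gpio_configure(7, 0, old);
 */
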
112static int __init scx200_init(void)
113{
114 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
115
116 return pci_register_driver(&scx200_pci_driver);
117}
118
119static void __exit scx200_cleanup(void)
120{
121 pci_unregister_driver(&scx200_pci_driver);
122 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
123}
124
125module_init(scx200_init);
126module_exit(scx200_cleanup);
127
128EXPORT_SYMBOL(scx200_gpio_base);
129EXPORT_SYMBOL(scx200_gpio_shadow);
130EXPORT_SYMBOL(scx200_gpio_configure);
131EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
new file mode 100644
index 000000000000..1200aaac403e
--- /dev/null
+++ b/arch/x86/kernel/setup64.c
@@ -0,0 +1,289 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 */
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/sched.h>
10#include <linux/string.h>
11#include <linux/bootmem.h>
12#include <linux/bitops.h>
13#include <linux/module.h>
14#include <asm/bootsetup.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/proto.h>
25#include <asm/sections.h>
26
27char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
28
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30
31struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
32EXPORT_SYMBOL(_cpu_pda);
33struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
34
35struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
36
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38
39unsigned long __supported_pte_mask __read_mostly = ~0UL;
40static int do_not_nx __cpuinitdata = 0;
41
42/* noexec=on|off
43Control non-executable mappings for 64-bit processes.
44
45on	Enable (default)
46off Disable
47*/
48static int __init nonx_setup(char *str)
49{
50 if (!str)
51 return -EINVAL;
52 if (!strncmp(str, "on", 2)) {
53 __supported_pte_mask |= _PAGE_NX;
54 do_not_nx = 0;
55 } else if (!strncmp(str, "off", 3)) {
56 do_not_nx = 1;
57 __supported_pte_mask &= ~_PAGE_NX;
58 }
59 return 0;
60}
61early_param("noexec", nonx_setup);
62
63int force_personality32 = 0;
64
65/* noexec32=on|off
66Control non-executable heap for 32-bit processes.
67To control the stack too use noexec=off
68
69on PROT_READ does not imply PROT_EXEC for 32bit processes
70off PROT_READ implies PROT_EXEC (default)
71*/
72static int __init nonx32_setup(char *str)
73{
74 if (!strcmp(str, "on"))
75 force_personality32 &= ~READ_IMPLIES_EXEC;
76 else if (!strcmp(str, "off"))
77 force_personality32 |= READ_IMPLIES_EXEC;
78 return 1;
79}
80__setup("noexec32=", nonx32_setup);
81
82/*
83 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning
86 */
87void __init setup_per_cpu_areas(void)
88{
89 int i;
90 unsigned long size;
91
92#ifdef CONFIG_HOTPLUG_CPU
93 prefill_possible_map();
94#endif
95
96 /* Copy section for each CPU (we discard the original) */
97 size = PERCPU_ENOUGH_ROOM;
98
99 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
100 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr;
102
103 if (!NODE_DATA(cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size);
107 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
109 }
110 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 }
115}
116
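/*
 * (Illustrative note: with data_offset filled in above, a CPU's copy of a
 * per-CPU variable is just the reference copy in .data.percpu shifted by
 * that offset -- roughly, with a hypothetical helper:
 *
 *	#define per_cpu_sketch(var, cpu) \
 *		(*(typeof(&(var)))((char *)&(var) + cpu_pda(cpu)->data_offset))
 * )
 */
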
117void pda_init(int cpu)
118{
119 struct x8664_pda *pda = cpu_pda(cpu);
120
121	/* Set up data that may be needed in __get_free_pages early */
122 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
123	/* Memory clobbers used to order PDA accesses */
124 mb();
125 wrmsrl(MSR_GS_BASE, pda);
126 mb();
127
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146
147 pda->irqstackptr += IRQSTACKSIZE-64;
148}
149
150char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
151__attribute__((section(".bss.page_aligned")));
152
153extern asmlinkage void ignore_sysret(void);
154
155/* May not be marked __init: used by software suspend */
156void syscall_init(void)
157{
158 /*
159 * LSTAR and STAR live in a somewhat strange symbiosis.
160 * They both write to the same internal register. STAR allows setting CS/DS
161 * but only a 32-bit target. LSTAR sets the 64-bit rip.
162 */
163 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
164 wrmsrl(MSR_LSTAR, system_call);
165 wrmsrl(MSR_CSTAR, ignore_sysret);
166
167#ifdef CONFIG_IA32_EMULATION
168 syscall32_cpu_init ();
169#endif
170
171 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
173}
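/*
 * The MSR_STAR write above packs two code segment selectors into one MSR:
 * bits 63..48 hold the selector base used for 32-bit SYSRET and bits 47..32
 * the kernel selector used on SYSCALL.  A user-space sketch of the packing;
 * the selector values are illustrative stand-ins, not this kernel's actual
 * __USER32_CS/__KERNEL_CS definitions.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t user32_cs = 0x23;	/* illustrative __USER32_CS */
	uint64_t kernel_cs = 0x10;	/* illustrative __KERNEL_CS */
	uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

	/* prints 0x0023001000000000 with the values above */
	printf("MSR_STAR = %#018llx\n", (unsigned long long)star);
	return 0;
}
#endif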
174
175void __cpuinit check_efer(void)
176{
177 unsigned long efer;
178
179 rdmsrl(MSR_EFER, efer);
180 if (!(efer & EFER_NX) || do_not_nx) {
181 __supported_pte_mask &= ~_PAGE_NX;
182 }
183}
184
185unsigned long kernel_eflags;
186
187/*
188 * cpu_init() initializes state that is per-CPU. Some data is already
189 * initialized (naturally) in the bootstrap process, such as the GDT
190 * and IDT. We reload them nevertheless; this function acts as a
191 * 'CPU state barrier': nothing should get across it.
192 * A lot of state is already set up in PDA init.
193 */
194void __cpuinit cpu_init (void)
195{
196 int cpu = stack_smp_processor_id();
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
199 unsigned long v;
200 char *estacks = NULL;
201 struct task_struct *me;
202 int i;
203
204 /* CPU 0 is initialised in head64.c */
205 if (cpu != 0) {
206 pda_init(cpu);
207 } else
208 estacks = boot_exception_stacks;
209
210 me = current;
211
212 if (cpu_test_and_set(cpu, cpu_initialized))
213 panic("CPU#%d already initialized!\n", cpu);
214
215 printk("Initializing CPU#%d\n", cpu);
216
217 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
218
219 /*
220 * Initialize the per-CPU GDT with the boot GDT,
221 * and set up the GDT descriptor:
222 */
223 if (cpu)
224 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
228 asm volatile("lidt %0" :: "m" (idt_descr));
229
230 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
231 syscall_init();
232
233 wrmsrl(MSR_FS_BASE, 0);
234 wrmsrl(MSR_KERNEL_GS_BASE, 0);
235 barrier();
236
237 check_efer();
238
239 /*
240 * set up and load the per-CPU TSS
241 */
242 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
243 static const unsigned int order[N_EXCEPTION_STACKS] = {
244 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
245 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
246 };
247 if (cpu) {
248 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
249 if (!estacks)
250 panic("Cannot allocate exception stack %ld %d\n",
251 v, cpu);
252 }
253 estacks += PAGE_SIZE << order[v];
254 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
255 }
256
257 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
258 /*
259 * <= is required because the CPU will access up to
260 * 8 bits beyond the end of the IO permission bitmap.
261 */
262 for (i = 0; i <= IO_BITMAP_LONGS; i++)
263 t->io_bitmap[i] = ~0UL;
264
265 atomic_inc(&init_mm.mm_count);
266 me->active_mm = &init_mm;
267 if (me->mm)
268 BUG();
269 enter_lazy_tlb(&init_mm, me);
270
271 set_tss_desc(cpu, t);
272 load_TR_desc();
273 load_LDT(&init_mm.context);
274
275 /*
276 * Clear all 6 debug registers:
277 */
278
279 set_debugreg(0UL, 0);
280 set_debugreg(0UL, 1);
281 set_debugreg(0UL, 2);
282 set_debugreg(0UL, 3);
283 set_debugreg(0UL, 6);
284 set_debugreg(0UL, 7);
285
286 fpu_init();
287
288 raw_local_save_flags(kernel_eflags);
289}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
new file mode 100644
index 000000000000..d474cd639bcb
--- /dev/null
+++ b/arch/x86/kernel/setup_32.c
@@ -0,0 +1,653 @@
1/*
2 * linux/arch/i386/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 *
8 * Memory region support
9 * David Parsons <orc@pell.chi.il.us>, July-August 1999
10 *
11 * Added E820 sanitization routine (removes overlapping memory regions);
12 * Brian Moyle <bmoyle@mvista.com>, February 2001
13 *
14 * Moved CPU detection code to cpu/${cpu}.c
15 * Patrick Mochel <mochel@osdl.org>, March 2002
16 *
17 * Provisions for empty E820 memory regions (reported by certain BIOSes).
18 * Alex Achenbach <xela@slit.de>, December 2002.
19 *
20 */
21
22/*
23 * This file handles the architecture-dependent parts of initialization
24 */
25
26#include <linux/sched.h>
27#include <linux/mm.h>
28#include <linux/mmzone.h>
29#include <linux/screen_info.h>
30#include <linux/ioport.h>
31#include <linux/acpi.h>
32#include <linux/apm_bios.h>
33#include <linux/initrd.h>
34#include <linux/bootmem.h>
35#include <linux/seq_file.h>
36#include <linux/console.h>
37#include <linux/mca.h>
38#include <linux/root_dev.h>
39#include <linux/highmem.h>
40#include <linux/module.h>
41#include <linux/efi.h>
42#include <linux/init.h>
43#include <linux/edd.h>
44#include <linux/nodemask.h>
45#include <linux/kexec.h>
46#include <linux/crash_dump.h>
47#include <linux/dmi.h>
48#include <linux/pfn.h>
49
50#include <video/edid.h>
51
52#include <asm/apic.h>
53#include <asm/e820.h>
54#include <asm/mpspec.h>
55#include <asm/mmzone.h>
56#include <asm/setup.h>
57#include <asm/arch_hooks.h>
58#include <asm/sections.h>
59#include <asm/io_apic.h>
60#include <asm/ist.h>
61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h>
64#include <bios_ebda.h>
65
66/* This value is set up by the early boot code to point to the address
67 immediately after the boot-time page tables. It contains a *physical*
68 address, and must not be in the .bss segment! */
69unsigned long init_pg_tables_end __initdata = ~0UL;
70
71int disable_pse __devinitdata = 0;
72
73/*
74 * Machine setup..
75 */
76extern struct resource code_resource;
77extern struct resource data_resource;
78
79/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
81/* common cpu data for all cpus */
82struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
83EXPORT_SYMBOL(boot_cpu_data);
84
85unsigned long mmu_cr4_features;
86
87/* for MCA, but anyone else can use it if they want */
88unsigned int machine_id;
89#ifdef CONFIG_MCA
90EXPORT_SYMBOL(machine_id);
91#endif
92unsigned int machine_submodel_id;
93unsigned int BIOS_revision;
94unsigned int mca_pentium_flag;
95
96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
97int bootloader_type;
98
99/* user-defined highmem size */
100static unsigned int highmem_pages = -1;
101
102/*
103 * Setup options
104 */
105struct screen_info screen_info;
106EXPORT_SYMBOL(screen_info);
107struct apm_info apm_info;
108EXPORT_SYMBOL(apm_info);
109struct edid_info edid_info;
110EXPORT_SYMBOL_GPL(edid_info);
111struct ist_info ist_info;
112#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
113 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
114EXPORT_SYMBOL(ist_info);
115#endif
116
117extern void early_cpu_init(void);
118extern int root_mountflags;
119
120unsigned long saved_videomode;
121
122#define RAMDISK_IMAGE_START_MASK 0x07FF
123#define RAMDISK_PROMPT_FLAG 0x8000
124#define RAMDISK_LOAD_FLAG 0x4000
125
126static char __initdata command_line[COMMAND_LINE_SIZE];
127
128struct boot_params __initdata boot_params;
129
130#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
131struct edd edd;
132#ifdef CONFIG_EDD_MODULE
133EXPORT_SYMBOL(edd);
134#endif
135/**
136 * copy_edd() - Copy the BIOS EDD information
137 * from boot_params into a safe place.
138 *
139 */
140static inline void copy_edd(void)
141{
142 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
143 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
144 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
145 edd.edd_info_nr = EDD_NR;
146}
147#else
148static inline void copy_edd(void)
149{
150}
151#endif
152
153int __initdata user_defined_memmap = 0;
154
155/*
156 * "mem=nopentium" disables the 4MB page tables.
157 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
158 * to <mem>, overriding the bios size.
159 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
160 * <start> to <start>+<mem>, overriding the bios size.
161 *
162 * HPA tells me bootloaders need to parse mem=, so no new
163 * option should reuse the mem= name [also see Documentation/i386/boot.txt]
164 */
165static int __init parse_mem(char *arg)
166{
167 if (!arg)
168 return -EINVAL;
169
170 if (strcmp(arg, "nopentium") == 0) {
171 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
172 disable_pse = 1;
173 } else {
174 /* If the user specifies memory size, we
175 * limit the BIOS-provided memory map to
176 * that size. exactmap can be used to specify
177 * the exact map. mem=number can be used to
178 * trim the existing memory map.
179 */
180 unsigned long long mem_size;
181
182 mem_size = memparse(arg, &arg);
183 limit_regions(mem_size);
184 user_defined_memmap = 1;
185 }
186 return 0;
187}
188early_param("mem", parse_mem);
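/*
 * memparse() turns a string such as "512M" or "2G" into a byte count before
 * limit_regions() runs above.  A rough user-space sketch of that suffix
 * handling (simplified; the real helper lives in lib/cmdline.c and the
 * sketch_ name is local to this illustration):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static unsigned long long sketch_memparse(const char *s)
{
	char *end;
	unsigned long long val = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 10;	/* fall through */
	case 'M': case 'm': val <<= 10;	/* fall through */
	case 'K': case 'k': val <<= 10;
	}
	return val;
}

int main(void)
{
	printf("mem=512M -> %llu bytes\n", sketch_memparse("512M"));	/* 536870912 */
	printf("mem=2G   -> %llu bytes\n", sketch_memparse("2G"));	/* 2147483648 */
	return 0;
}
#endif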
189
190#ifdef CONFIG_PROC_VMCORE
191/* elfcorehdr= specifies the location of the ELF core header
192 * stored by the crashed kernel.
193 */
194static int __init parse_elfcorehdr(char *arg)
195{
196 if (!arg)
197 return -EINVAL;
198
199 elfcorehdr_addr = memparse(arg, &arg);
200 return 0;
201}
202early_param("elfcorehdr", parse_elfcorehdr);
203#endif /* CONFIG_PROC_VMCORE */
204
205/*
206 * highmem=size forces highmem to be exactly 'size' bytes.
207 * This works even on boxes that have no highmem otherwise.
208 * This also works to reduce highmem size on bigger boxes.
209 */
210static int __init parse_highmem(char *arg)
211{
212 if (!arg)
213 return -EINVAL;
214
215 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
216 return 0;
217}
218early_param("highmem", parse_highmem);
219
220/*
221 * vmalloc=size forces the vmalloc area to be exactly 'size'
222 * bytes. This can be used to increase (or decrease) the
223 * vmalloc area - the default is 128m.
224 */
225static int __init parse_vmalloc(char *arg)
226{
227 if (!arg)
228 return -EINVAL;
229
230 __VMALLOC_RESERVE = memparse(arg, &arg);
231 return 0;
232}
233early_param("vmalloc", parse_vmalloc);
234
235/*
236 * reservetop=size reserves a hole at the top of the kernel address space which
237 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
238 * so relocating the fixmap can be done before paging initialization.
239 */
240static int __init parse_reservetop(char *arg)
241{
242 unsigned long address;
243
244 if (!arg)
245 return -EINVAL;
246
247 address = memparse(arg, &arg);
248 reserve_top_address(address);
249 return 0;
250}
251early_param("reservetop", parse_reservetop);
252
253/*
254 * Determine low and high memory ranges:
255 */
256unsigned long __init find_max_low_pfn(void)
257{
258 unsigned long max_low_pfn;
259
260 max_low_pfn = max_pfn;
261 if (max_low_pfn > MAXMEM_PFN) {
262 if (highmem_pages == -1)
263 highmem_pages = max_pfn - MAXMEM_PFN;
264 if (highmem_pages + MAXMEM_PFN < max_pfn)
265 max_pfn = MAXMEM_PFN + highmem_pages;
266 if (highmem_pages + MAXMEM_PFN > max_pfn) {
267 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
268 highmem_pages = 0;
269 }
270 max_low_pfn = MAXMEM_PFN;
271#ifndef CONFIG_HIGHMEM
272 /* Maximum memory usable is what is directly addressable */
273 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
274 MAXMEM>>20);
275 if (max_pfn > MAX_NONPAE_PFN)
276 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
277 else
278 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
279 max_pfn = MAXMEM_PFN;
280#else /* !CONFIG_HIGHMEM */
281#ifndef CONFIG_HIGHMEM64G
282 if (max_pfn > MAX_NONPAE_PFN) {
283 max_pfn = MAX_NONPAE_PFN;
284 printk(KERN_WARNING "Warning only 4GB will be used.\n");
285 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
286 }
287#endif /* !CONFIG_HIGHMEM64G */
288#endif /* !CONFIG_HIGHMEM */
289 } else {
290 if (highmem_pages == -1)
291 highmem_pages = 0;
292#ifdef CONFIG_HIGHMEM
293 if (highmem_pages >= max_pfn) {
294 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
295 highmem_pages = 0;
296 }
297 if (highmem_pages) {
298 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
299 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
300 highmem_pages = 0;
301 }
302 max_low_pfn -= highmem_pages;
303 }
304#else
305 if (highmem_pages)
306 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
307#endif
308 }
309 return max_low_pfn;
310}
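/*
 * Worked example of the split above, under the usual assumption of a
 * ~896MB lowmem limit (MAXMEM_PFN == 0x38000) and 2GB of RAM
 * (max_pfn == 0x80000) with no highmem= override: highmem_pages defaults
 * to 0x80000 - 0x38000 == 0x48000 pages (~1152MB), max_low_pfn is clamped
 * to 0x38000, and the remainder goes to the highmem zone.
 */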
311
312/*
313 * workaround for Dell systems that neglect to reserve EBDA
314 */
315static void __init reserve_ebda_region(void)
316{
317 unsigned int addr;
318 addr = get_bios_ebda();
319 if (addr)
320 reserve_bootmem(addr, PAGE_SIZE);
321}
322
323#ifndef CONFIG_NEED_MULTIPLE_NODES
324void __init setup_bootmem_allocator(void);
325static unsigned long __init setup_memory(void)
326{
327 /*
328 * partially used pages are not usable - thus
329 * we are rounding upwards:
330 */
331 min_low_pfn = PFN_UP(init_pg_tables_end);
332
333 find_max_pfn();
334
335 max_low_pfn = find_max_low_pfn();
336
337#ifdef CONFIG_HIGHMEM
338 highstart_pfn = highend_pfn = max_pfn;
339 if (max_pfn > max_low_pfn) {
340 highstart_pfn = max_low_pfn;
341 }
342 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
343 pages_to_mb(highend_pfn - highstart_pfn));
344 num_physpages = highend_pfn;
345 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
346#else
347 num_physpages = max_low_pfn;
348 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
349#endif
350#ifdef CONFIG_FLATMEM
351 max_mapnr = num_physpages;
352#endif
353 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
354 pages_to_mb(max_low_pfn));
355
356 setup_bootmem_allocator();
357
358 return max_low_pfn;
359}
360
361void __init zone_sizes_init(void)
362{
363 unsigned long max_zone_pfns[MAX_NR_ZONES];
364 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
365 max_zone_pfns[ZONE_DMA] =
366 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
367 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
368#ifdef CONFIG_HIGHMEM
369 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
370 add_active_range(0, 0, highend_pfn);
371#else
372 add_active_range(0, 0, max_low_pfn);
373#endif
374
375 free_area_init_nodes(max_zone_pfns);
376}
377#else
378extern unsigned long __init setup_memory(void);
379extern void zone_sizes_init(void);
380#endif /* !CONFIG_NEED_MULTIPLE_NODES */
381
382void __init setup_bootmem_allocator(void)
383{
384 unsigned long bootmap_size;
385 /*
386 * Initialize the boot-time allocator (with low memory only):
387 */
388 bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
389
390 register_bootmem_low_pages(max_low_pfn);
391
392 /*
393 * Reserve the bootmem bitmap itself as well. We do this in two
394 * steps (first step was init_bootmem()) because this catches
395 * the (very unlikely) case of us accidentally initializing the
396 * bootmem allocator with an invalid RAM area.
397 */
398 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
399 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
400
401 /*
402 * reserve physical page 0 - it's a special BIOS page on many boxes,
403 * enabling clean reboots, SMP operation, laptop functions.
404 */
405 reserve_bootmem(0, PAGE_SIZE);
406
407 /* reserve EBDA region, it's a 4K region */
408 reserve_ebda_region();
409
410 /* This could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
411 PCI prefetch into it (errata #56). Usually the page is reserved anyway,
412 unless no PS/2 mouse is plugged in. */
413 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
414 boot_cpu_data.x86 == 6)
415 reserve_bootmem(0xa0000 - 4096, 4096);
416
417#ifdef CONFIG_SMP
418 /*
419 * But first pinch a few for the stack/trampoline stuff
420 * FIXME: Don't need the extra page at 4K, but need to fix
421 * trampoline before removing it. (see the GDT stuff)
422 */
423 reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
424#endif
425#ifdef CONFIG_ACPI_SLEEP
426 /*
427 * Reserve low memory region for sleep support.
428 */
429 acpi_reserve_bootmem();
430#endif
431#ifdef CONFIG_X86_FIND_SMP_CONFIG
432 /*
433 * Find and reserve possible boot-time SMP configuration:
434 */
435 find_smp_config();
436#endif
437 numa_kva_reserve();
438#ifdef CONFIG_BLK_DEV_INITRD
439 if (LOADER_TYPE && INITRD_START) {
440 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
441 reserve_bootmem(INITRD_START, INITRD_SIZE);
442 initrd_start = INITRD_START + PAGE_OFFSET;
443 initrd_end = initrd_start+INITRD_SIZE;
444 }
445 else {
446 printk(KERN_ERR "initrd extends beyond end of memory "
447 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
448 INITRD_START + INITRD_SIZE,
449 max_low_pfn << PAGE_SHIFT);
450 initrd_start = 0;
451 }
452 }
453#endif
454#ifdef CONFIG_KEXEC
455 if (crashk_res.start != crashk_res.end)
456 reserve_bootmem(crashk_res.start,
457 crashk_res.end - crashk_res.start + 1);
458#endif
459}
460
461/*
462 * The node 0 pgdat is initialized before all of these because
463 * it's needed for bootmem. node>0 pgdats have their virtual
464 * space allocated before the pagetables are in place to access
465 * them, so they can't be cleared then.
466 *
467 * This should all compile down to nothing when NUMA is off.
468 */
469static void __init remapped_pgdat_init(void)
470{
471 int nid;
472
473 for_each_online_node(nid) {
474 if (nid != 0)
475 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
476 }
477}
478
479#ifdef CONFIG_MCA
480static void set_mca_bus(int x)
481{
482 MCA_bus = x;
483}
484#else
485static void set_mca_bus(int x) { }
486#endif
487
488/* Overridden in paravirt.c if CONFIG_PARAVIRT */
489char * __init __attribute__((weak)) memory_setup(void)
490{
491 return machine_specific_memory_setup();
492}
493
494/*
495 * Determine if we were loaded by an EFI loader. If so, then we have also been
496 * passed the efi memmap, systab, etc., so we should use these data structures
497 * for initialization. Note, the efi init code path is determined by the
498 * global efi_enabled. This allows the same kernel image to be used on existing
499 * systems (with a traditional BIOS) as well as on EFI systems.
500 */
501void __init setup_arch(char **cmdline_p)
502{
503 unsigned long max_low_pfn;
504
505 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
506 pre_setup_arch_hook();
507 early_cpu_init();
508
509 /*
510 * FIXME: This isn't an official loader_type right
511 * now but does currently work with elilo.
512 * If we were configured as an EFI kernel, check to make
513 * sure that we were loaded correctly from elilo and that
514 * the system table is valid. If not, then initialize normally.
515 */
516#ifdef CONFIG_EFI
517 if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
518 efi_enabled = 1;
519#endif
520
521 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
522 screen_info = SCREEN_INFO;
523 edid_info = EDID_INFO;
524 apm_info.bios = APM_BIOS_INFO;
525 ist_info = IST_INFO;
526 saved_videomode = VIDEO_MODE;
527 if( SYS_DESC_TABLE.length != 0 ) {
528 set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
529 machine_id = SYS_DESC_TABLE.table[0];
530 machine_submodel_id = SYS_DESC_TABLE.table[1];
531 BIOS_revision = SYS_DESC_TABLE.table[2];
532 }
533 bootloader_type = LOADER_TYPE;
534
535#ifdef CONFIG_BLK_DEV_RAM
536 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
537 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
538 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
539#endif
540 ARCH_SETUP
541 if (efi_enabled)
542 efi_init();
543 else {
544 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
545 print_memory_map(memory_setup());
546 }
547
548 copy_edd();
549
550 if (!MOUNT_ROOT_RDONLY)
551 root_mountflags &= ~MS_RDONLY;
552 init_mm.start_code = (unsigned long) _text;
553 init_mm.end_code = (unsigned long) _etext;
554 init_mm.end_data = (unsigned long) _edata;
555 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
556
557 code_resource.start = virt_to_phys(_text);
558 code_resource.end = virt_to_phys(_etext)-1;
559 data_resource.start = virt_to_phys(_etext);
560 data_resource.end = virt_to_phys(_edata)-1;
561
562 parse_early_param();
563
564 if (user_defined_memmap) {
565 printk(KERN_INFO "user-defined physical RAM map:\n");
566 print_memory_map("user");
567 }
568
569 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
570 *cmdline_p = command_line;
571
572 max_low_pfn = setup_memory();
573
574#ifdef CONFIG_VMI
575 /*
576 * Must be after max_low_pfn is determined, and before kernel
577 * pagetables are setup.
578 */
579 vmi_init();
580#endif
581
582 /*
583 * NOTE: before this point _nobody_ is allowed to allocate
584 * any memory using the bootmem allocator. Although the
585 * allocator is now initialised, only the first 8MB of the kernel
586 * virtual address space has been mapped. All allocations before
587 * paging_init() has completed must use the alloc_bootmem_low_pages()
588 * variant (which allocates DMA'able memory), and care must be taken
589 * not to exceed the 8MB limit.
590 */
591
592#ifdef CONFIG_SMP
593 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
594#endif
595 paging_init();
596 remapped_pgdat_init();
597 sparse_init();
598 zone_sizes_init();
599
600 /*
601 * NOTE: at this point the bootmem allocator is fully available.
602 */
603
604 paravirt_post_allocator_init();
605
606 dmi_scan_machine();
607
608#ifdef CONFIG_X86_GENERICARCH
609 generic_apic_probe();
610#endif
611 if (efi_enabled)
612 efi_map_memmap();
613
614#ifdef CONFIG_ACPI
615 /*
616 * Parse the ACPI tables for possible boot-time SMP configuration.
617 */
618 acpi_boot_table_init();
619#endif
620
621#ifdef CONFIG_PCI
622#ifdef CONFIG_X86_IO_APIC
623 check_acpi_pci(); /* Checks more than just ACPI actually */
624#endif
625#endif
626
627#ifdef CONFIG_ACPI
628 acpi_boot_init();
629
630#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
631 if (def_to_bigsmp)
632 printk(KERN_WARNING "More than 8 CPUs detected and "
633 "CONFIG_X86_PC cannot handle it.\nUse "
634 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
635#endif
636#endif
637#ifdef CONFIG_X86_LOCAL_APIC
638 if (smp_found_config)
639 get_smp_config();
640#endif
641
642 e820_register_memory();
643 e820_mark_nosave_regions();
644
645#ifdef CONFIG_VT
646#if defined(CONFIG_VGA_CONSOLE)
647 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
648 conswitchp = &vga_con;
649#elif defined(CONFIG_DUMMY_CONSOLE)
650 conswitchp = &dummy_con;
651#endif
652#endif
653}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
new file mode 100644
index 000000000000..af838f6b0b7f
--- /dev/null
+++ b/arch/x86/kernel/setup_64.c
@@ -0,0 +1,1117 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 */
9
10/*
11 * This file handles the architecture-dependent parts of initialization
12 */
13
14#include <linux/errno.h>
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/stddef.h>
19#include <linux/unistd.h>
20#include <linux/ptrace.h>
21#include <linux/slab.h>
22#include <linux/user.h>
23#include <linux/a.out.h>
24#include <linux/screen_info.h>
25#include <linux/ioport.h>
26#include <linux/delay.h>
27#include <linux/init.h>
28#include <linux/initrd.h>
29#include <linux/highmem.h>
30#include <linux/bootmem.h>
31#include <linux/module.h>
32#include <asm/processor.h>
33#include <linux/console.h>
34#include <linux/seq_file.h>
35#include <linux/crash_dump.h>
36#include <linux/root_dev.h>
37#include <linux/pci.h>
38#include <linux/acpi.h>
39#include <linux/kallsyms.h>
40#include <linux/edd.h>
41#include <linux/mmzone.h>
42#include <linux/kexec.h>
43#include <linux/cpufreq.h>
44#include <linux/dmi.h>
45#include <linux/dma-mapping.h>
46#include <linux/ctype.h>
47
48#include <asm/mtrr.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/io.h>
52#include <asm/smp.h>
53#include <asm/msr.h>
54#include <asm/desc.h>
55#include <video/edid.h>
56#include <asm/e820.h>
57#include <asm/dma.h>
58#include <asm/mpspec.h>
59#include <asm/mmu_context.h>
60#include <asm/bootsetup.h>
61#include <asm/proto.h>
62#include <asm/setup.h>
63#include <asm/mach_apic.h>
64#include <asm/numa.h>
65#include <asm/sections.h>
66#include <asm/dmi.h>
67
68/*
69 * Machine setup..
70 */
71
72struct cpuinfo_x86 boot_cpu_data __read_mostly;
73EXPORT_SYMBOL(boot_cpu_data);
74
75unsigned long mmu_cr4_features;
76
77/* Boot loader ID as an integer, for the benefit of proc_dointvec */
78int bootloader_type;
79
80unsigned long saved_video_mode;
81
82int force_mwait __cpuinitdata;
83
84/*
85 * Early DMI memory
86 */
87int dmi_alloc_index;
88char dmi_alloc_data[DMI_MAX_DATA];
89
90/*
91 * Setup options
92 */
93struct screen_info screen_info;
94EXPORT_SYMBOL(screen_info);
95struct sys_desc_table_struct {
96 unsigned short length;
97 unsigned char table[0];
98};
99
100struct edid_info edid_info;
101EXPORT_SYMBOL_GPL(edid_info);
102
103extern int root_mountflags;
104
105char __initdata command_line[COMMAND_LINE_SIZE];
106
107struct resource standard_io_resources[] = {
108 { .name = "dma1", .start = 0x00, .end = 0x1f,
109 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
110 { .name = "pic1", .start = 0x20, .end = 0x21,
111 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
112 { .name = "timer0", .start = 0x40, .end = 0x43,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
114 { .name = "timer1", .start = 0x50, .end = 0x53,
115 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
116 { .name = "keyboard", .start = 0x60, .end = 0x6f,
117 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
118 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
119 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
120 { .name = "pic2", .start = 0xa0, .end = 0xa1,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
122 { .name = "dma2", .start = 0xc0, .end = 0xdf,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
124 { .name = "fpu", .start = 0xf0, .end = 0xff,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
126};
127
128#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
129
130struct resource data_resource = {
131 .name = "Kernel data",
132 .start = 0,
133 .end = 0,
134 .flags = IORESOURCE_RAM,
135};
136struct resource code_resource = {
137 .name = "Kernel code",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142
143#ifdef CONFIG_PROC_VMCORE
144/* elfcorehdr= specifies the location of elf core header
145 * stored by the crashed kernel. This option will be passed
146 * by kexec loader to the capture kernel.
147 */
148static int __init setup_elfcorehdr(char *arg)
149{
150 char *end;
151 if (!arg)
152 return -EINVAL;
153 elfcorehdr_addr = memparse(arg, &end);
154 return end > arg ? 0 : -EINVAL;
155}
156early_param("elfcorehdr", setup_elfcorehdr);
157#endif
158
159#ifndef CONFIG_NUMA
160static void __init
161contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
162{
163 unsigned long bootmap_size, bootmap;
164
165 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
166 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
167 if (bootmap == -1L)
168 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
169 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
170 e820_register_active_regions(0, start_pfn, end_pfn);
171 free_bootmem_with_active_regions(0, end_pfn);
172 reserve_bootmem(bootmap, bootmap_size);
173}
174#endif
175
176#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
177struct edd edd;
178#ifdef CONFIG_EDD_MODULE
179EXPORT_SYMBOL(edd);
180#endif
181/**
182 * copy_edd() - Copy the BIOS EDD information
183 * from boot_params into a safe place.
184 *
185 */
186static inline void copy_edd(void)
187{
188 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
189 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
190 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
191 edd.edd_info_nr = EDD_NR;
192}
193#else
194static inline void copy_edd(void)
195{
196}
197#endif
198
199#define EBDA_ADDR_POINTER 0x40E
200
201unsigned __initdata ebda_addr;
202unsigned __initdata ebda_size;
203
204static void discover_ebda(void)
205{
206 /*
207 * there is a real-mode segmented pointer pointing to the
208 * 4K EBDA area at 0x40E
209 */
210 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
211 ebda_addr <<= 4;
212
213 ebda_size = *(unsigned short *)__va(ebda_addr);
214
215 /* Round EBDA up to pages */
216 if (ebda_size == 0)
217 ebda_size = 1;
218 ebda_size <<= 10;
219 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
220 if (ebda_size > 64*1024)
221 ebda_size = 64*1024;
222}
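/*
 * Worked example of the real-mode pointer arithmetic above: if the 16-bit
 * word at 0x40E happened to read 0x9fc0 (an illustrative value), the EBDA
 * would sit at 0x9fc0 << 4 == 0x9fc00, and the word found there would give
 * its size in KB, which is then rounded up to whole pages and capped at 64KB.
 */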
223
224void __init setup_arch(char **cmdline_p)
225{
226 printk(KERN_INFO "Command line: %s\n", boot_command_line);
227
228 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
229 screen_info = SCREEN_INFO;
230 edid_info = EDID_INFO;
231 saved_video_mode = SAVED_VIDEO_MODE;
232 bootloader_type = LOADER_TYPE;
233
234#ifdef CONFIG_BLK_DEV_RAM
235 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
236 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
237 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
238#endif
239 setup_memory_region();
240 copy_edd();
241
242 if (!MOUNT_ROOT_RDONLY)
243 root_mountflags &= ~MS_RDONLY;
244 init_mm.start_code = (unsigned long) &_text;
245 init_mm.end_code = (unsigned long) &_etext;
246 init_mm.end_data = (unsigned long) &_edata;
247 init_mm.brk = (unsigned long) &_end;
248
249 code_resource.start = virt_to_phys(&_text);
250 code_resource.end = virt_to_phys(&_etext)-1;
251 data_resource.start = virt_to_phys(&_etext);
252 data_resource.end = virt_to_phys(&_edata)-1;
253
254 early_identify_cpu(&boot_cpu_data);
255
256 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
257 *cmdline_p = command_line;
258
259 parse_early_param();
260
261 finish_e820_parsing();
262
263 e820_register_active_regions(0, 0, -1UL);
264 /*
265 * partially used pages are not usable - thus
266 * we are rounding upwards:
267 */
268 end_pfn = e820_end_of_ram();
269 num_physpages = end_pfn;
270
271 check_efer();
272
273 discover_ebda();
274
275 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
276
277 dmi_scan_machine();
278
279#ifdef CONFIG_ACPI
280 /*
281 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
282 * Call this early for SRAT node setup.
283 */
284 acpi_boot_table_init();
285#endif
286
287 /* How many end-of-memory variables you have, grandma! */
288 max_low_pfn = end_pfn;
289 max_pfn = end_pfn;
290 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
291
292 /* Remove active ranges so rediscovery with NUMA-awareness happens */
293 remove_all_active_ranges();
294
295#ifdef CONFIG_ACPI_NUMA
296 /*
297 * Parse SRAT to discover nodes.
298 */
299 acpi_numa_init();
300#endif
301
302#ifdef CONFIG_NUMA
303 numa_initmem_init(0, end_pfn);
304#else
305 contig_initmem_init(0, end_pfn);
306#endif
307
308 /* Reserve direct mapping */
309 reserve_bootmem_generic(table_start << PAGE_SHIFT,
310 (table_end - table_start) << PAGE_SHIFT);
311
312 /* reserve kernel */
313 reserve_bootmem_generic(__pa_symbol(&_text),
314 __pa_symbol(&_end) - __pa_symbol(&_text));
315
316 /*
317 * reserve physical page 0 - it's a special BIOS page on many boxes,
318 * enabling clean reboots, SMP operation, laptop functions.
319 */
320 reserve_bootmem_generic(0, PAGE_SIZE);
321
322 /* reserve ebda region */
323 if (ebda_addr)
324 reserve_bootmem_generic(ebda_addr, ebda_size);
325#ifdef CONFIG_NUMA
326 /* reserve nodemap region */
327 if (nodemap_addr)
328 reserve_bootmem_generic(nodemap_addr, nodemap_size);
329#endif
330
331#ifdef CONFIG_SMP
332 /* Reserve SMP trampoline */
333 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
334#endif
335
336#ifdef CONFIG_ACPI_SLEEP
337 /*
338 * Reserve low memory region for sleep support.
339 */
340 acpi_reserve_bootmem();
341#endif
342 /*
343 * Find and reserve possible boot-time SMP configuration:
344 */
345 find_smp_config();
346#ifdef CONFIG_BLK_DEV_INITRD
347 if (LOADER_TYPE && INITRD_START) {
348 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
349 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
350 initrd_start = INITRD_START + PAGE_OFFSET;
351 initrd_end = initrd_start+INITRD_SIZE;
352 }
353 else {
354 printk(KERN_ERR "initrd extends beyond end of memory "
355 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
356 (unsigned long)(INITRD_START + INITRD_SIZE),
357 (unsigned long)(end_pfn << PAGE_SHIFT));
358 initrd_start = 0;
359 }
360 }
361#endif
362#ifdef CONFIG_KEXEC
363 if (crashk_res.start != crashk_res.end) {
364 reserve_bootmem_generic(crashk_res.start,
365 crashk_res.end - crashk_res.start + 1);
366 }
367#endif
368
369 paging_init();
370
371#ifdef CONFIG_PCI
372 early_quirks();
373#endif
374
375 /*
376 * Set this early, so we don't allocate cpu0
377 * if the MADT list doesn't list the BSP first.
378 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
379 */
380 cpu_set(0, cpu_present_map);
381#ifdef CONFIG_ACPI
382 /*
383 * Read APIC and some other early information from ACPI tables.
384 */
385 acpi_boot_init();
386#endif
387
388 init_cpu_to_node();
389
390 /*
391 * get boot-time SMP configuration:
392 */
393 if (smp_found_config)
394 get_smp_config();
395 init_apic_mappings();
396
397 /*
398 * We trust e820 completely. No explicit ROM probing in memory.
399 */
400 e820_reserve_resources();
401 e820_mark_nosave_regions();
402
403 {
404 unsigned i;
405 /* request I/O space for devices used on all i[345]86 PCs */
406 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
407 request_resource(&ioport_resource, &standard_io_resources[i]);
408 }
409
410 e820_setup_gap();
411
412#ifdef CONFIG_VT
413#if defined(CONFIG_VGA_CONSOLE)
414 conswitchp = &vga_con;
415#elif defined(CONFIG_DUMMY_CONSOLE)
416 conswitchp = &dummy_con;
417#endif
418#endif
419}
420
421static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
422{
423 unsigned int *v;
424
425 if (c->extended_cpuid_level < 0x80000004)
426 return 0;
427
428 v = (unsigned int *) c->x86_model_id;
429 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
430 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
431 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
432 c->x86_model_id[48] = 0;
433 return 1;
434}
435
436
437static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
438{
439 unsigned int n, dummy, eax, ebx, ecx, edx;
440
441 n = c->extended_cpuid_level;
442
443 if (n >= 0x80000005) {
444 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
445 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
446 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
447 c->x86_cache_size=(ecx>>24)+(edx>>24);
448 /* On K8 L1 TLB is inclusive, so don't count it */
449 c->x86_tlbsize = 0;
450 }
451
452 if (n >= 0x80000006) {
453 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
454 ecx = cpuid_ecx(0x80000006);
455 c->x86_cache_size = ecx >> 16;
456 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
457
458 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
459 c->x86_cache_size, ecx & 0xFF);
460 }
461
462 if (n >= 0x80000007)
463 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
464 if (n >= 0x80000008) {
465 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
466 c->x86_virt_bits = (eax >> 8) & 0xff;
467 c->x86_phys_bits = eax & 0xff;
468 }
469}
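/*
 * Sketch of the bit-field extraction used above, with a made-up CPUID
 * 0x80000006 ECX value: 0x02006140 would decode to a 512K L2 cache with
 * 64-byte lines (size in bits 31..16, line size in bits 7..0).
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ecx = 0x02006140;	/* hypothetical CPUID 0x80000006 ECX */

	printf("L2 Cache: %uK (%u bytes/line)\n", ecx >> 16, ecx & 0xFF);
	return 0;
}
#endif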
470
471#ifdef CONFIG_NUMA
472static int nearby_node(int apicid)
473{
474 int i;
475 for (i = apicid - 1; i >= 0; i--) {
476 int node = apicid_to_node[i];
477 if (node != NUMA_NO_NODE && node_online(node))
478 return node;
479 }
480 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
481 int node = apicid_to_node[i];
482 if (node != NUMA_NO_NODE && node_online(node))
483 return node;
484 }
485 return first_node(node_online_map); /* Shouldn't happen */
486}
487#endif
488
489/*
490 * On an AMD dual-core setup the lower bits of the APIC id distinguish the cores.
491 * Assumes the number of cores is a power of two.
492 */
493static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
494{
495#ifdef CONFIG_SMP
496 unsigned bits;
497#ifdef CONFIG_NUMA
498 int cpu = smp_processor_id();
499 int node = 0;
500 unsigned apicid = hard_smp_processor_id();
501#endif
502 unsigned ecx = cpuid_ecx(0x80000008);
503
504 c->x86_max_cores = (ecx & 0xff) + 1;
505
506 /* CPU telling us the core id bits shift? */
507 bits = (ecx >> 12) & 0xF;
508
509 /* Otherwise recompute */
510 if (bits == 0) {
511 while ((1 << bits) < c->x86_max_cores)
512 bits++;
513 }
514
515 /* Low order bits define the core id (index of core in socket) */
516 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
517 /* Convert the APIC ID into the socket ID */
518 c->phys_proc_id = phys_pkg_id(bits);
519
520#ifdef CONFIG_NUMA
521 node = c->phys_proc_id;
522 if (apicid_to_node[apicid] != NUMA_NO_NODE)
523 node = apicid_to_node[apicid];
524 if (!node_online(node)) {
525 /* Two possibilities here:
526 - The CPU is missing memory and no node was created.
527 In that case try picking one from a nearby CPU
528 - The APIC IDs differ from the HyperTransport node IDs
529 which the K8 northbridge parsing fills in.
530 Assume they are all increased by a constant offset,
531 but in the same order as the HT nodeids.
532 If that doesn't result in a usable node fall back to the
533 path for the previous case. */
534 int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
535 if (ht_nodeid >= 0 &&
536 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
537 node = apicid_to_node[ht_nodeid];
538 /* Pick a nearby node */
539 if (!node_online(node))
540 node = nearby_node(apicid);
541 }
542 numa_set_node(cpu, node);
543
544 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
545#endif
546#endif
547}
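/*
 * Sketch of the APIC-id split performed above, with hypothetical inputs: a
 * dual-core part reporting CPUID 0x80000008 ECX == 0x00001001 (core count 2,
 * core-id width 1 bit) and APIC id 5 would end up with core id 5 & 1 == 1
 * and socket id 5 >> 1 == 2.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned ecx = 0x00001001;	/* hypothetical CPUID 0x80000008 ECX */
	unsigned apicid = 5;		/* hypothetical initial APIC id */
	unsigned cores = (ecx & 0xff) + 1;
	unsigned bits = (ecx >> 12) & 0xF;

	if (bits == 0)			/* recompute if the CPU didn't say */
		while ((1u << bits) < cores)
			bits++;

	printf("cores=%u core_id=%u socket=%u\n",
	       cores, apicid & ((1u << bits) - 1), apicid >> bits);
	return 0;
}
#endif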
548
549static void __cpuinit init_amd(struct cpuinfo_x86 *c)
550{
551 unsigned level;
552
553#ifdef CONFIG_SMP
554 unsigned long value;
555
556 /*
557 * Disable the TLB flush filter by setting HWCR.FFDIS on K8
558 * (bit 6 of MSR C001_0015).
559 *
560 * Errata 63 for SH-B3 steppings
561 * Errata 122 for all steppings (F+ have it disabled by default)
562 */
563 if (c->x86 == 15) {
564 rdmsrl(MSR_K8_HWCR, value);
565 value |= 1 << 6;
566 wrmsrl(MSR_K8_HWCR, value);
567 }
568#endif
569
570 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
571 3DNow is identified by bit 31 in extended CPUID (1*32+31) anyway */
572 clear_bit(0*32+31, &c->x86_capability);
573
574 /* On C+ stepping K8 rep microcode works well for copy/memset */
575 level = cpuid_eax(1);
576 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
577 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
578 if (c->x86 == 0x10)
579 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
580
581 /* Enable workaround for FXSAVE leak */
582 if (c->x86 >= 6)
583 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
584
585 level = get_model_name(c);
586 if (!level) {
587 switch (c->x86) {
588 case 15:
589 /* Should distinguish Models here, but this is only
590 a fallback anyway. */
591 strcpy(c->x86_model_id, "Hammer");
592 break;
593 }
594 }
595 display_cacheinfo(c);
596
597 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
598 if (c->x86_power & (1<<8))
599 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
600
601 /* Multi core CPU? */
602 if (c->extended_cpuid_level >= 0x80000008)
603 amd_detect_cmp(c);
604
605 if (c->extended_cpuid_level >= 0x80000006 &&
606 (cpuid_edx(0x80000006) & 0xf000))
607 num_cache_leaves = 4;
608 else
609 num_cache_leaves = 3;
610
611 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
612 set_bit(X86_FEATURE_K8, &c->x86_capability);
613
614 /* RDTSC can be speculated around */
615 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
616
617 /* Family 10 doesn't support C states in MWAIT so don't use it */
618 if (c->x86 == 0x10 && !force_mwait)
619 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
620}
621
622static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
623{
624#ifdef CONFIG_SMP
625 u32 eax, ebx, ecx, edx;
626 int index_msb, core_bits;
627
628 cpuid(1, &eax, &ebx, &ecx, &edx);
629
630
631 if (!cpu_has(c, X86_FEATURE_HT))
632 return;
633 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
634 goto out;
635
636 smp_num_siblings = (ebx & 0xff0000) >> 16;
637
638 if (smp_num_siblings == 1) {
639 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
640 } else if (smp_num_siblings > 1 ) {
641
642 if (smp_num_siblings > NR_CPUS) {
643 printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
644 smp_num_siblings = 1;
645 return;
646 }
647
648 index_msb = get_count_order(smp_num_siblings);
649 c->phys_proc_id = phys_pkg_id(index_msb);
650
651 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
652
653 index_msb = get_count_order(smp_num_siblings) ;
654
655 core_bits = get_count_order(c->x86_max_cores);
656
657 c->cpu_core_id = phys_pkg_id(index_msb) &
658 ((1 << core_bits) - 1);
659 }
660out:
661 if ((c->x86_max_cores * smp_num_siblings) > 1) {
662 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
663 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
664 }
665
666#endif
667}
668
669/*
670 * find out the number of processor cores on the die
671 */
672static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
673{
674 unsigned int eax, t;
675
676 if (c->cpuid_level < 4)
677 return 1;
678
679 cpuid_count(4, 0, &eax, &t, &t, &t);
680
681 if (eax & 0x1f)
682 return ((eax >> 26) + 1);
683 else
684 return 1;
685}
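/*
 * Worked example of the leaf-4 decode above, with a hypothetical
 * CPUID(4,0).EAX of 0x04000121: the low 5 bits are non-zero (a valid cache
 * descriptor), and bits 31..26 hold 1, so the function would report
 * 1 + 1 == 2 cores.
 */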
686
687static void srat_detect_node(void)
688{
689#ifdef CONFIG_NUMA
690 unsigned node;
691 int cpu = smp_processor_id();
692 int apicid = hard_smp_processor_id();
693
694 /* For now, don't do the funky fallback heuristics
695 that the AMD version employs. */
696 node = apicid_to_node[apicid];
697 if (node == NUMA_NO_NODE)
698 node = first_node(node_online_map);
699 numa_set_node(cpu, node);
700
701 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
702#endif
703}
704
705static void __cpuinit init_intel(struct cpuinfo_x86 *c)
706{
707 /* Cache sizes */
708 unsigned n;
709
710 init_intel_cacheinfo(c);
711 if (c->cpuid_level > 9 ) {
712 unsigned eax = cpuid_eax(10);
713 /* Check for version and the number of counters */
714 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
715 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
716 }
717
718 if (cpu_has_ds) {
719 unsigned int l1, l2;
720 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
721 if (!(l1 & (1<<11)))
722 set_bit(X86_FEATURE_BTS, c->x86_capability);
723 if (!(l1 & (1<<12)))
724 set_bit(X86_FEATURE_PEBS, c->x86_capability);
725 }
726
727 n = c->extended_cpuid_level;
728 if (n >= 0x80000008) {
729 unsigned eax = cpuid_eax(0x80000008);
730 c->x86_virt_bits = (eax >> 8) & 0xff;
731 c->x86_phys_bits = eax & 0xff;
732 /* CPUID workaround for Intel 0F34 CPU */
733 if (c->x86_vendor == X86_VENDOR_INTEL &&
734 c->x86 == 0xF && c->x86_model == 0x3 &&
735 c->x86_mask == 0x4)
736 c->x86_phys_bits = 36;
737 }
738
739 if (c->x86 == 15)
740 c->x86_cache_alignment = c->x86_clflush_size * 2;
741 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
742 (c->x86 == 0x6 && c->x86_model >= 0x0e))
743 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
744 if (c->x86 == 6)
745 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
746 if (c->x86 == 15)
747 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
748 else
749 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
750 c->x86_max_cores = intel_num_cpu_cores(c);
751
752 srat_detect_node();
753}
754
755static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
756{
757 char *v = c->x86_vendor_id;
758
759 if (!strcmp(v, "AuthenticAMD"))
760 c->x86_vendor = X86_VENDOR_AMD;
761 else if (!strcmp(v, "GenuineIntel"))
762 c->x86_vendor = X86_VENDOR_INTEL;
763 else
764 c->x86_vendor = X86_VENDOR_UNKNOWN;
765}
766
767struct cpu_model_info {
768 int vendor;
769 int family;
770 char *model_names[16];
771};
772
773/* Do some early cpuid on the boot CPU to get some parameters that are
774 needed before check_bugs. Everything advanced is in identify_cpu
775 below. */
776void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
777{
778 u32 tfms;
779
780 c->loops_per_jiffy = loops_per_jiffy;
781 c->x86_cache_size = -1;
782 c->x86_vendor = X86_VENDOR_UNKNOWN;
783 c->x86_model = c->x86_mask = 0; /* So far unknown... */
784 c->x86_vendor_id[0] = '\0'; /* Unset */
785 c->x86_model_id[0] = '\0'; /* Unset */
786 c->x86_clflush_size = 64;
787 c->x86_cache_alignment = c->x86_clflush_size;
788 c->x86_max_cores = 1;
789 c->extended_cpuid_level = 0;
790 memset(&c->x86_capability, 0, sizeof c->x86_capability);
791
792 /* Get vendor name */
793 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
794 (unsigned int *)&c->x86_vendor_id[0],
795 (unsigned int *)&c->x86_vendor_id[8],
796 (unsigned int *)&c->x86_vendor_id[4]);
797
798 get_cpu_vendor(c);
799
800 /* Initialize the standard set of capabilities */
801 /* Note that the vendor-specific code below might override */
802
803 /* Intel-defined flags: level 0x00000001 */
804 if (c->cpuid_level >= 0x00000001) {
805 __u32 misc;
806 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
807 &c->x86_capability[0]);
808 c->x86 = (tfms >> 8) & 0xf;
809 c->x86_model = (tfms >> 4) & 0xf;
810 c->x86_mask = tfms & 0xf;
811 if (c->x86 == 0xf)
812 c->x86 += (tfms >> 20) & 0xff;
813 if (c->x86 >= 0x6)
814 c->x86_model += ((tfms >> 16) & 0xF) << 4;
815 if (c->x86_capability[0] & (1<<19))
816 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
817 } else {
818 /* Have CPUID level 0 only - unheard of */
819 c->x86 = 4;
820 }
821
822#ifdef CONFIG_SMP
823 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
824#endif
825}
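/*
 * Worked example of the family/model/stepping decode above, using a made-up
 * CPUID leaf 1 EAX value of 0x00100f42: base family 0xf plus extended
 * family 0x01 gives x86 == 0x10, the extended model nibble is 0 so
 * x86_model stays 0x4, and x86_mask (the stepping) is 0x2.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t tfms = 0x00100f42;	/* hypothetical CPUID(1).EAX */
	unsigned family = (tfms >> 8) & 0xf;
	unsigned model = (tfms >> 4) & 0xf;
	unsigned stepping = tfms & 0xf;

	if (family == 0xf)
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)
		model += ((tfms >> 16) & 0xf) << 4;

	/* prints: family 0x10 model 0x4 stepping 0x2 */
	printf("family %#x model %#x stepping %#x\n", family, model, stepping);
	return 0;
}
#endif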
826
827/*
828 * This does the hard work of actually picking apart the CPU stuff...
829 */
830void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
831{
832 int i;
833 u32 xlvl;
834
835 early_identify_cpu(c);
836
837 /* AMD-defined flags: level 0x80000001 */
838 xlvl = cpuid_eax(0x80000000);
839 c->extended_cpuid_level = xlvl;
840 if ((xlvl & 0xffff0000) == 0x80000000) {
841 if (xlvl >= 0x80000001) {
842 c->x86_capability[1] = cpuid_edx(0x80000001);
843 c->x86_capability[6] = cpuid_ecx(0x80000001);
844 }
845 if (xlvl >= 0x80000004)
846 get_model_name(c); /* Default name */
847 }
848
849 /* Transmeta-defined flags: level 0x80860001 */
850 xlvl = cpuid_eax(0x80860000);
851 if ((xlvl & 0xffff0000) == 0x80860000) {
852 /* Don't set x86_cpuid_level here for now, to avoid confusion. */
853 if (xlvl >= 0x80860001)
854 c->x86_capability[2] = cpuid_edx(0x80860001);
855 }
856
857 init_scattered_cpuid_features(c);
858
859 c->apicid = phys_pkg_id(0);
860
861 /*
862 * Vendor-specific initialization. In this section we
863 * canonicalize the feature flags: features a certain CPU
864 * supports but which CPUID doesn't tell us about, CPUID
865 * claiming incorrect flags, and other bugs are all
866 * handled here.
867 *
868 * At the end of this section, c->x86_capability better
869 * indicate the features this CPU genuinely supports!
870 */
871 switch (c->x86_vendor) {
872 case X86_VENDOR_AMD:
873 init_amd(c);
874 break;
875
876 case X86_VENDOR_INTEL:
877 init_intel(c);
878 break;
879
880 case X86_VENDOR_UNKNOWN:
881 default:
882 display_cacheinfo(c);
883 break;
884 }
885
886 select_idle_routine(c);
887 detect_ht(c);
888
889 /*
890 * On SMP, boot_cpu_data holds the common feature set between
891 * all CPUs; so make sure that we indicate which features are
892 * common between the CPUs. The first time this routine gets
893 * executed, c == &boot_cpu_data.
894 */
895 if (c != &boot_cpu_data) {
896 /* AND the already accumulated flags with these */
897 for (i = 0 ; i < NCAPINTS ; i++)
898 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
899 }
900
901#ifdef CONFIG_X86_MCE
902 mcheck_init(c);
903#endif
904 if (c != &boot_cpu_data)
905 mtrr_ap_init();
906#ifdef CONFIG_NUMA
907 numa_add_cpu(smp_processor_id());
908#endif
909}
910
911
912void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
913{
914 if (c->x86_model_id[0])
915 printk("%s", c->x86_model_id);
916
917 if (c->x86_mask || c->cpuid_level >= 0)
918 printk(" stepping %02x\n", c->x86_mask);
919 else
920 printk("\n");
921}
922
923/*
924 * Get CPU information for use by the procfs.
925 */
926
927static int show_cpuinfo(struct seq_file *m, void *v)
928{
929 struct cpuinfo_x86 *c = v;
930
931 /*
932 * These flag bits must match the definitions in <asm/cpufeature.h>.
933 * NULL means this bit is undefined or reserved; either way it doesn't
934 * have meaning as far as Linux is concerned. Note that it's important
935 * to realize there is a difference between this table and CPUID -- if
936 * applications want to get the raw CPUID data, they should access
937 * /dev/cpu/<cpu_nr>/cpuid instead.
938 */
939 static char *x86_cap_flags[] = {
940 /* Intel-defined */
941 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
942 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
943 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
944 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
945
946 /* AMD-defined */
947 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
948 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
949 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
950 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
951 "3dnowext", "3dnow",
952
953 /* Transmeta-defined */
954 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
955 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
956 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
957 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
958
959 /* Other (Linux-defined) */
960 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
961 NULL, NULL, NULL, NULL,
962 "constant_tsc", "up", NULL, "arch_perfmon",
963 "pebs", "bts", NULL, "sync_rdtsc",
964 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
965 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
966
967 /* Intel-defined (#2) */
968 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
969 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
970 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
971 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
972
973 /* VIA/Cyrix/Centaur-defined */
974 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
975 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
976 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
977 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
978
979 /* AMD-defined (#2) */
980 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
981 "altmovcr8", "abm", "sse4a",
982 "misalignsse", "3dnowprefetch",
983 "osvw", "ibs", NULL, NULL, NULL, NULL,
984 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
985 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
986
987 /* Auxiliary (Linux-defined) */
988 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
989 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
990 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
991 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
992 };
993 static char *x86_power_flags[] = {
994 "ts", /* temperature sensor */
995 "fid", /* frequency id control */
996 "vid", /* voltage id control */
997 "ttp", /* thermal trip */
998 "tm",
999 "stc",
1000 "100mhzsteps",
1001 "hwpstate",
1002 "", /* tsc invariant mapped to constant_tsc */
1003 /* nothing */
1004 };
1005
1006
1007#ifdef CONFIG_SMP
1008 if (!cpu_online(c-cpu_data))
1009 return 0;
1010#endif
1011
1012 seq_printf(m,"processor\t: %u\n"
1013 "vendor_id\t: %s\n"
1014 "cpu family\t: %d\n"
1015 "model\t\t: %d\n"
1016 "model name\t: %s\n",
1017 (unsigned)(c-cpu_data),
1018 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1019 c->x86,
1020 (int)c->x86_model,
1021 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1022
1023 if (c->x86_mask || c->cpuid_level >= 0)
1024 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1025 else
1026 seq_printf(m, "stepping\t: unknown\n");
1027
1028 if (cpu_has(c,X86_FEATURE_TSC)) {
1029 unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
1030 if (!freq)
1031 freq = cpu_khz;
1032 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1033 freq / 1000, (freq % 1000));
1034 }
1035
1036 /* Cache size */
1037 if (c->x86_cache_size >= 0)
1038 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1039
1040#ifdef CONFIG_SMP
1041 if (smp_num_siblings * c->x86_max_cores > 1) {
1042 int cpu = c - cpu_data;
1043 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
1044 seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
1045 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1046 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1047 }
1048#endif
1049
1050 seq_printf(m,
1051 "fpu\t\t: yes\n"
1052 "fpu_exception\t: yes\n"
1053 "cpuid level\t: %d\n"
1054 "wp\t\t: yes\n"
1055 "flags\t\t:",
1056 c->cpuid_level);
1057
1058 {
1059 int i;
1060 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1061 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1062 seq_printf(m, " %s", x86_cap_flags[i]);
1063 }
1064
1065 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1066 c->loops_per_jiffy/(500000/HZ),
1067 (c->loops_per_jiffy/(5000/HZ)) % 100);
1068
1069 if (c->x86_tlbsize > 0)
1070 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1071 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1072 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1073
1074 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1075 c->x86_phys_bits, c->x86_virt_bits);
1076
1077 seq_printf(m, "power management:");
1078 {
1079 unsigned i;
1080 for (i = 0; i < 32; i++)
1081 if (c->x86_power & (1 << i)) {
1082 if (i < ARRAY_SIZE(x86_power_flags) &&
1083 x86_power_flags[i])
1084 seq_printf(m, "%s%s",
1085 x86_power_flags[i][0]?" ":"",
1086 x86_power_flags[i]);
1087 else
1088 seq_printf(m, " [%d]", i);
1089 }
1090 }
1091
1092 seq_printf(m, "\n\n");
1093
1094 return 0;
1095}
1096
1097static void *c_start(struct seq_file *m, loff_t *pos)
1098{
1099 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1100}
1101
1102static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1103{
1104 ++*pos;
1105 return c_start(m, pos);
1106}
1107
1108static void c_stop(struct seq_file *m, void *v)
1109{
1110}
1111
1112struct seq_operations cpuinfo_op = {
1113 .start =c_start,
1114 .next = c_next,
1115 .stop = c_stop,
1116 .show = show_cpuinfo,
1117};
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe_32.h
new file mode 100644
index 000000000000..0b2221711dad
--- /dev/null
+++ b/arch/x86/kernel/sigframe_32.h
@@ -0,0 +1,21 @@
1struct sigframe
2{
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 struct _fpstate fpstate;
7 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8];
9};
10
11struct rt_sigframe
12{
13 char __user *pretcode;
14 int sig;
15 struct siginfo __user *pinfo;
16 void __user *puc;
17 struct siginfo info;
18 struct ucontext uc;
19 struct _fpstate fpstate;
20 char retcode[8];
21};
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..c03570f7fe8e
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,667 @@
1/*
2 * linux/arch/i386/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
7 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/unistd.h>
18#include <linux/stddef.h>
19#include <linux/personality.h>
20#include <linux/suspend.h>
21#include <linux/ptrace.h>
22#include <linux/elf.h>
23#include <linux/binfmts.h>
24#include <asm/processor.h>
25#include <asm/ucontext.h>
26#include <asm/uaccess.h>
27#include <asm/i387.h>
28#include "sigframe_32.h"
29
30#define DEBUG_SIG 0
31
32#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
33
34/*
35 * Atomically swap in the new signal mask, and wait for a signal.
36 */
37asmlinkage int
38sys_sigsuspend(int history0, int history1, old_sigset_t mask)
39{
40 mask &= _BLOCKABLE;
41 spin_lock_irq(&current->sighand->siglock);
42 current->saved_sigmask = current->blocked;
43 siginitset(&current->blocked, mask);
44 recalc_sigpending();
45 spin_unlock_irq(&current->sighand->siglock);
46
47 current->state = TASK_INTERRUPTIBLE;
48 schedule();
49 set_thread_flag(TIF_RESTORE_SIGMASK);
50 return -ERESTARTNOHAND;
51}
52
53asmlinkage int
54sys_sigaction(int sig, const struct old_sigaction __user *act,
55 struct old_sigaction __user *oact)
56{
57 struct k_sigaction new_ka, old_ka;
58 int ret;
59
60 if (act) {
61 old_sigset_t mask;
62 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
63 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
64 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
65 return -EFAULT;
66 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
67 __get_user(mask, &act->sa_mask);
68 siginitset(&new_ka.sa.sa_mask, mask);
69 }
70
71 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
72
73 if (!ret && oact) {
74 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
75 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
76 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
77 return -EFAULT;
78 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
79 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
80 }
81
82 return ret;
83}
84
85asmlinkage int
86sys_sigaltstack(unsigned long ebx)
87{
88 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
89 struct pt_regs *regs = (struct pt_regs *)&ebx;
90 const stack_t __user *uss = (const stack_t __user *)ebx;
91 stack_t __user *uoss = (stack_t __user *)regs->ecx;
92
93 return do_sigaltstack(uss, uoss, regs->esp);
94}
95
96
97/*
98 * Do a signal return; undo the signal stack.
99 */
100
101static int
102restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
103{
104 unsigned int err = 0;
105
106 /* Always make any pending restarted system calls return -EINTR */
107 current_thread_info()->restart_block.fn = do_no_restart_syscall;
108
109#define COPY(x) err |= __get_user(regs->x, &sc->x)
110
111#define COPY_SEG(seg) \
112 { unsigned short tmp; \
113 err |= __get_user(tmp, &sc->seg); \
114 regs->x##seg = tmp; }
115
116#define COPY_SEG_STRICT(seg) \
117 { unsigned short tmp; \
118 err |= __get_user(tmp, &sc->seg); \
119 regs->x##seg = tmp|3; }
120
121#define GET_SEG(seg) \
122 { unsigned short tmp; \
123 err |= __get_user(tmp, &sc->seg); \
124 loadsegment(seg,tmp); }
125
126#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
127 X86_EFLAGS_OF | X86_EFLAGS_DF | \
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130
131 GET_SEG(gs);
132 COPY_SEG(fs);
133 COPY_SEG(es);
134 COPY_SEG(ds);
135 COPY(edi);
136 COPY(esi);
137 COPY(ebp);
138 COPY(esp);
139 COPY(ebx);
140 COPY(edx);
141 COPY(ecx);
142 COPY(eip);
143 COPY_SEG_STRICT(cs);
144 COPY_SEG_STRICT(ss);
145
146 {
147 unsigned int tmpflags;
148 err |= __get_user(tmpflags, &sc->eflags);
149 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
150 regs->orig_eax = -1; /* disable syscall checks */
151 }
152
153 {
154 struct _fpstate __user * buf;
155 err |= __get_user(buf, &sc->fpstate);
156 if (buf) {
157 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
158 goto badframe;
159 err |= restore_i387(buf);
160 } else {
161 struct task_struct *me = current;
162 if (used_math()) {
163 clear_fpu(me);
164 clear_used_math();
165 }
166 }
167 }
168
169 err |= __get_user(*peax, &sc->eax);
170 return err;
171
172badframe:
173 return 1;
174}
175
176asmlinkage int sys_sigreturn(unsigned long __unused)
177{
178 struct pt_regs *regs = (struct pt_regs *) &__unused;
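	/*
	 * The handler's return popped pretcode and the sigreturn stub's
	 * "popl %eax" popped sig, so the frame normally sits 8 bytes below
	 * the user esp at this point.
	 */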
179 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
180 sigset_t set;
181 int eax;
182
183 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
184 goto badframe;
185 if (__get_user(set.sig[0], &frame->sc.oldmask)
186 || (_NSIG_WORDS > 1
187 && __copy_from_user(&set.sig[1], &frame->extramask,
188 sizeof(frame->extramask))))
189 goto badframe;
190
191 sigdelsetmask(&set, ~_BLOCKABLE);
192 spin_lock_irq(&current->sighand->siglock);
193 current->blocked = set;
194 recalc_sigpending();
195 spin_unlock_irq(&current->sighand->siglock);
196
197 if (restore_sigcontext(regs, &frame->sc, &eax))
198 goto badframe;
199 return eax;
200
201badframe:
202 if (show_unhandled_signals && printk_ratelimit())
203 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
204 " esp:%lx oeax:%lx\n",
205 current->pid > 1 ? KERN_INFO : KERN_EMERG,
206 current->comm, current->pid, frame, regs->eip,
207 regs->esp, regs->orig_eax);
208
209 force_sig(SIGSEGV, current);
210 return 0;
211}
212
213asmlinkage int sys_rt_sigreturn(unsigned long __unused)
214{
215 struct pt_regs *regs = (struct pt_regs *) &__unused;
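	/*
	 * Only pretcode has been popped by the handler's return, so the
	 * rt frame sits 4 bytes below the user esp.
	 */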
216 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
217 sigset_t set;
218 int eax;
219
220 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
221 goto badframe;
222 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
223 goto badframe;
224
225 sigdelsetmask(&set, ~_BLOCKABLE);
226 spin_lock_irq(&current->sighand->siglock);
227 current->blocked = set;
228 recalc_sigpending();
229 spin_unlock_irq(&current->sighand->siglock);
230
231 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
232 goto badframe;
233
234 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
235 goto badframe;
236
237 return eax;
238
239badframe:
240 force_sig(SIGSEGV, current);
241 return 0;
242}
243
244/*
245 * Set up a signal frame.
246 */
247
248static int
249setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
250 struct pt_regs *regs, unsigned long mask)
251{
252 int tmp, err = 0;
253
254 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
255 savesegment(gs, tmp);
256 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
257
258 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
259 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
260 err |= __put_user(regs->edi, &sc->edi);
261 err |= __put_user(regs->esi, &sc->esi);
262 err |= __put_user(regs->ebp, &sc->ebp);
263 err |= __put_user(regs->esp, &sc->esp);
264 err |= __put_user(regs->ebx, &sc->ebx);
265 err |= __put_user(regs->edx, &sc->edx);
266 err |= __put_user(regs->ecx, &sc->ecx);
267 err |= __put_user(regs->eax, &sc->eax);
268 err |= __put_user(current->thread.trap_no, &sc->trapno);
269 err |= __put_user(current->thread.error_code, &sc->err);
270 err |= __put_user(regs->eip, &sc->eip);
271 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
272 err |= __put_user(regs->eflags, &sc->eflags);
273 err |= __put_user(regs->esp, &sc->esp_at_signal);
274 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
275
276 tmp = save_i387(fpstate);
277 if (tmp < 0)
278 err = 1;
279 else
280 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
281
282 /* non-iBCS2 extensions.. */
283 err |= __put_user(mask, &sc->oldmask);
284 err |= __put_user(current->thread.cr2, &sc->cr2);
285
286 return err;
287}
288
289/*
290 * Determine which stack to use..
291 */
292static inline void __user *
293get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
294{
295 unsigned long esp;
296
297 /* Default to using normal stack */
298 esp = regs->esp;
299
300 /* This is the X/Open sanctioned signal stack switching. */
301 if (ka->sa.sa_flags & SA_ONSTACK) {
302 if (sas_ss_flags(esp) == 0)
303 esp = current->sas_ss_sp + current->sas_ss_size;
304 }
305
306 /* This is the legacy signal stack switching. */
307 else if ((regs->xss & 0xffff) != __USER_DS &&
308 !(ka->sa.sa_flags & SA_RESTORER) &&
309 ka->sa.sa_restorer) {
310 esp = (unsigned long) ka->sa.sa_restorer;
311 }
312
313 esp -= frame_size;
314 /* Align the stack pointer according to the i386 ABI,
315 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
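	/* For example, esp = 0xbffff123 -> ((0xbffff127) & ~15) - 4 = 0xbffff11c,
	   and (0xbffff11c + 4) & 15 == 0 as required. */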
316 esp = ((esp + 4) & -16ul) - 4;
317 return (void __user *) esp;
318}
319
320/* These symbols are defined with the addresses in the vsyscall page.
321 See vsyscall-sigreturn.S. */
322extern void __user __kernel_sigreturn;
323extern void __user __kernel_rt_sigreturn;
324
325static int setup_frame(int sig, struct k_sigaction *ka,
326 sigset_t *set, struct pt_regs * regs)
327{
328 void __user *restorer;
329 struct sigframe __user *frame;
330 int err = 0;
331 int usig;
332
333 frame = get_sigframe(ka, regs, sizeof(*frame));
334
335 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
336 goto give_sigsegv;
337
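	/*
	 * Translate the signal number through the exec domain's inverse map,
	 * so that personalities with non-Linux signal numbering see their
	 * own value in the frame.
	 */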
338 usig = current_thread_info()->exec_domain
339 && current_thread_info()->exec_domain->signal_invmap
340 && sig < 32
341 ? current_thread_info()->exec_domain->signal_invmap[sig]
342 : sig;
343
344 err = __put_user(usig, &frame->sig);
345 if (err)
346 goto give_sigsegv;
347
348 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
349 if (err)
350 goto give_sigsegv;
351
352 if (_NSIG_WORDS > 1) {
353 err = __copy_to_user(&frame->extramask, &set->sig[1],
354 sizeof(frame->extramask));
355 if (err)
356 goto give_sigsegv;
357 }
358
359 if (current->binfmt->hasvdso)
360 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
361 else
362 restorer = (void *)&frame->retcode;
363 if (ka->sa.sa_flags & SA_RESTORER)
364 restorer = ka->sa.sa_restorer;
365
366 /* Set up to return from userspace. */
367 err |= __put_user(restorer, &frame->pretcode);
368
369 /*
370 * This is popl %eax ; movl $,%eax ; int $0x80
371 *
372 * WE DO NOT USE IT ANY MORE! It's only left here for historical
373 * reasons and because gdb uses it as a signature to notice
374 * signal handler stack frames.
375 */
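	/*
	 * Byte layout (little endian): 0x58 = popl %eax, 0xb8 imm32 =
	 * movl $__NR_sigreturn,%eax, 0xcd 0x80 = int $0x80 -- 8 bytes total.
	 */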
376 err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
377 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
378 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
379
380 if (err)
381 goto give_sigsegv;
382
383 /* Set up registers for signal handler */
384 regs->esp = (unsigned long) frame;
385 regs->eip = (unsigned long) ka->sa.sa_handler;
386 regs->eax = (unsigned long) sig;
387 regs->edx = (unsigned long) 0;
388 regs->ecx = (unsigned long) 0;
389
390 set_fs(USER_DS);
391 regs->xds = __USER_DS;
392 regs->xes = __USER_DS;
393 regs->xss = __USER_DS;
394 regs->xcs = __USER_CS;
395
396 /*
397 * Clear TF when entering the signal handler, but
398 * notify any tracer that was single-stepping it.
399 * The tracer may want to single-step inside the
400 * handler too.
401 */
402 regs->eflags &= ~TF_MASK;
403 if (test_thread_flag(TIF_SINGLESTEP))
404 ptrace_notify(SIGTRAP);
405
406#if DEBUG_SIG
407 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
408 current->comm, current->pid, frame, regs->eip, frame->pretcode);
409#endif
410
411 return 0;
412
413give_sigsegv:
414 force_sigsegv(sig, current);
415 return -EFAULT;
416}
417
418static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
419 sigset_t *set, struct pt_regs * regs)
420{
421 void __user *restorer;
422 struct rt_sigframe __user *frame;
423 int err = 0;
424 int usig;
425
426 frame = get_sigframe(ka, regs, sizeof(*frame));
427
428 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
429 goto give_sigsegv;
430
431 usig = current_thread_info()->exec_domain
432 && current_thread_info()->exec_domain->signal_invmap
433 && sig < 32
434 ? current_thread_info()->exec_domain->signal_invmap[sig]
435 : sig;
436
437 err |= __put_user(usig, &frame->sig);
438 err |= __put_user(&frame->info, &frame->pinfo);
439 err |= __put_user(&frame->uc, &frame->puc);
440 err |= copy_siginfo_to_user(&frame->info, info);
441 if (err)
442 goto give_sigsegv;
443
444 /* Create the ucontext. */
445 err |= __put_user(0, &frame->uc.uc_flags);
446 err |= __put_user(0, &frame->uc.uc_link);
447 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
448 err |= __put_user(sas_ss_flags(regs->esp),
449 &frame->uc.uc_stack.ss_flags);
450 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
451 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
452 regs, set->sig[0]);
453 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
454 if (err)
455 goto give_sigsegv;
456
457 /* Set up to return from userspace. */
458 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
459 if (ka->sa.sa_flags & SA_RESTORER)
460 restorer = ka->sa.sa_restorer;
461 err |= __put_user(restorer, &frame->pretcode);
462
463 /*
464 * This is movl $,%eax ; int $0x80
465 *
466 * WE DO NOT USE IT ANY MORE! It's only left here for historical
467 * reasons and because gdb uses it as a signature to notice
468 * signal handler stack frames.
469 */
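	/*
	 * Byte layout: 0xb8 imm32 = movl $__NR_rt_sigreturn,%eax at offset 0,
	 * 0xcd 0x80 = int $0x80 at offset 5.
	 */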
470 err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
471 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
472 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
473
474 if (err)
475 goto give_sigsegv;
476
477 /* Set up registers for signal handler */
478 regs->esp = (unsigned long) frame;
479 regs->eip = (unsigned long) ka->sa.sa_handler;
480 regs->eax = (unsigned long) usig;
481 regs->edx = (unsigned long) &frame->info;
482 regs->ecx = (unsigned long) &frame->uc;
483
484 set_fs(USER_DS);
485 regs->xds = __USER_DS;
486 regs->xes = __USER_DS;
487 regs->xss = __USER_DS;
488 regs->xcs = __USER_CS;
489
490 /*
491 * Clear TF when entering the signal handler, but
492 * notify any tracer that was single-stepping it.
493 * The tracer may want to single-step inside the
494 * handler too.
495 */
496 regs->eflags &= ~TF_MASK;
497 if (test_thread_flag(TIF_SINGLESTEP))
498 ptrace_notify(SIGTRAP);
499
500#if DEBUG_SIG
501 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
502 current->comm, current->pid, frame, regs->eip, frame->pretcode);
503#endif
504
505 return 0;
506
507give_sigsegv:
508 force_sigsegv(sig, current);
509 return -EFAULT;
510}
511
512/*
513 * OK, we're invoking a handler
514 */
515
516static int
517handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
518 sigset_t *oldset, struct pt_regs * regs)
519{
520 int ret;
521
522 /* Are we from a system call? */
523 if (regs->orig_eax >= 0) {
524 /* If so, check system call restarting.. */
525 switch (regs->eax) {
526 case -ERESTART_RESTARTBLOCK:
527 case -ERESTARTNOHAND:
528 regs->eax = -EINTR;
529 break;
530
531 case -ERESTARTSYS:
532 if (!(ka->sa.sa_flags & SA_RESTART)) {
533 regs->eax = -EINTR;
534 break;
535 }
536 /* fallthrough */
537 case -ERESTARTNOINTR:
538 regs->eax = regs->orig_eax;
539 regs->eip -= 2;
540 }
541 }
542
543 /*
544 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
545 * that register information in the sigcontext is correct.
546 */
547 if (unlikely(regs->eflags & TF_MASK)
548 && likely(current->ptrace & PT_DTRACE)) {
549 current->ptrace &= ~PT_DTRACE;
550 regs->eflags &= ~TF_MASK;
551 }
552
553 /* Set up the stack frame */
554 if (ka->sa.sa_flags & SA_SIGINFO)
555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
556 else
557 ret = setup_frame(sig, ka, oldset, regs);
558
559 if (ret == 0) {
560 spin_lock_irq(&current->sighand->siglock);
561 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
562 if (!(ka->sa.sa_flags & SA_NODEFER))
563 sigaddset(&current->blocked,sig);
564 recalc_sigpending();
565 spin_unlock_irq(&current->sighand->siglock);
566 }
567
568 return ret;
569}
570
571/*
572 * Note that 'init' is a special process: it doesn't get signals it doesn't
573 * want to handle. Thus you cannot kill init even with a SIGKILL even by
574 * mistake.
575 */
576static void fastcall do_signal(struct pt_regs *regs)
577{
578 siginfo_t info;
579 int signr;
580 struct k_sigaction ka;
581 sigset_t *oldset;
582
583 /*
584 * We want the common case to go fast, which
585 * is why we may in certain cases get here from
586 * kernel mode. Just return without doing anything
587 * if so. vm86 regs switched out by assembly code
588 * before reaching here, so testing against kernel
589 * CS suffices.
590 */
591 if (!user_mode(regs))
592 return;
593
594 if (test_thread_flag(TIF_RESTORE_SIGMASK))
595 oldset = &current->saved_sigmask;
596 else
597 oldset = &current->blocked;
598
599 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
600 if (signr > 0) {
601 /* Reenable any watchpoints before delivering the
602 * signal to user space. The processor register will
603 * have been cleared if the watchpoint triggered
604 * inside the kernel.
605 */
606 if (unlikely(current->thread.debugreg[7]))
607 set_debugreg(current->thread.debugreg[7], 7);
608
609 /* Whee! Actually deliver the signal. */
610 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
611 /* a signal was successfully delivered; the saved
612 * sigmask will have been stored in the signal frame,
613 * and will be restored by sigreturn, so we can simply
614 * clear the TIF_RESTORE_SIGMASK flag */
615 if (test_thread_flag(TIF_RESTORE_SIGMASK))
616 clear_thread_flag(TIF_RESTORE_SIGMASK);
617 }
618
619 return;
620 }
621
622 /* Did we come from a system call? */
623 if (regs->orig_eax >= 0) {
624 /* Restart the system call - no handlers present */
625 switch (regs->eax) {
626 case -ERESTARTNOHAND:
627 case -ERESTARTSYS:
628 case -ERESTARTNOINTR:
629 regs->eax = regs->orig_eax;
630 regs->eip -= 2;
631 break;
632
633 case -ERESTART_RESTARTBLOCK:
634 regs->eax = __NR_restart_syscall;
635 regs->eip -= 2;
636 break;
637 }
638 }
639
640 /* if there's no signal to deliver, we just put the saved sigmask
641 * back */
642 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
643 clear_thread_flag(TIF_RESTORE_SIGMASK);
644 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
645 }
646}
647
648/*
649 * notification of userspace execution resumption
650 * - triggered by the TIF_WORK_MASK flags
651 */
652__attribute__((regparm(3)))
653void do_notify_resume(struct pt_regs *regs, void *_unused,
654 __u32 thread_info_flags)
655{
656 /* Pending single-step? */
657 if (thread_info_flags & _TIF_SINGLESTEP) {
658 regs->eflags |= TF_MASK;
659 clear_thread_flag(TIF_SINGLESTEP);
660 }
661
662 /* deal with pending signal delivery */
663 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
664 do_signal(regs);
665
666 clear_thread_flag(TIF_IRET);
667}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..739175b01e06
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,495 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 */
11
12#include <linux/sched.h>
13#include <linux/mm.h>
14#include <linux/smp.h>
15#include <linux/kernel.h>
16#include <linux/signal.h>
17#include <linux/errno.h>
18#include <linux/wait.h>
19#include <linux/ptrace.h>
20#include <linux/unistd.h>
21#include <linux/stddef.h>
22#include <linux/personality.h>
23#include <linux/compiler.h>
24#include <asm/ucontext.h>
25#include <asm/uaccess.h>
26#include <asm/i387.h>
27#include <asm/proto.h>
28#include <asm/ia32_unistd.h>
29#include <asm/mce.h>
30
31/* #define DEBUG_SIG 1 */
32
33#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
34
35int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
36 sigset_t *set, struct pt_regs * regs);
37int ia32_setup_frame(int sig, struct k_sigaction *ka,
38 sigset_t *set, struct pt_regs * regs);
39
40asmlinkage long
41sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
42 struct pt_regs *regs)
43{
44 return do_sigaltstack(uss, uoss, regs->rsp);
45}
46
47
48/*
49 * Do a signal return; undo the signal stack.
50 */
51
52struct rt_sigframe
53{
54 char __user *pretcode;
55 struct ucontext uc;
56 struct siginfo info;
57};
58
59static int
60restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
61{
62 unsigned int err = 0;
63
64 /* Always make any pending restarted system calls return -EINTR */
65 current_thread_info()->restart_block.fn = do_no_restart_syscall;
66
67#define COPY(x) err |= __get_user(regs->x, &sc->x)
68
69 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
70 COPY(rdx); COPY(rcx); COPY(rip);
71 COPY(r8);
72 COPY(r9);
73 COPY(r10);
74 COPY(r11);
75 COPY(r12);
76 COPY(r13);
77 COPY(r14);
78 COPY(r15);
79
80 /* Kernel saves and restores only the CS segment register on signals,
81 * which is the bare minimum needed to allow mixed 32/64-bit code.
82 * App's signal handler can save/restore other segments if needed. */
83 {
84 unsigned cs;
85 err |= __get_user(cs, &sc->cs);
86 regs->cs = cs | 3; /* Force into user mode */
87 }
88
89 {
90 unsigned int tmpflags;
91 err |= __get_user(tmpflags, &sc->eflags);
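		/*
		 * 0x40DD5 = AC|OF|DF|TF|SF|ZF|AF|PF|CF: only these user-visible
		 * flags are taken over from the signal context.
		 */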
92 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
93 regs->orig_rax = -1; /* disable syscall checks */
94 }
95
96 {
97 struct _fpstate __user * buf;
98 err |= __get_user(buf, &sc->fpstate);
99
100 if (buf) {
101 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
102 goto badframe;
103 err |= restore_i387(buf);
104 } else {
105 struct task_struct *me = current;
106 if (used_math()) {
107 clear_fpu(me);
108 clear_used_math();
109 }
110 }
111 }
112
113 err |= __get_user(*prax, &sc->rax);
114 return err;
115
116badframe:
117 return 1;
118}
119
120asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
121{
122 struct rt_sigframe __user *frame;
123 sigset_t set;
124 unsigned long eax;
125
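	/* The return into the SA_RESTORER stub popped pretcode, so the frame
	   starts 8 bytes below the user rsp. */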
126 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
127 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
128 goto badframe;
129 }
130 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
131 goto badframe;
132 }
133
134 sigdelsetmask(&set, ~_BLOCKABLE);
135 spin_lock_irq(&current->sighand->siglock);
136 current->blocked = set;
137 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock);
139
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
141 goto badframe;
142
143#ifdef DEBUG_SIG
144 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
145#endif
146
147 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
148 goto badframe;
149
150 return eax;
151
152badframe:
153 signal_fault(regs,frame,"sigreturn");
154 return 0;
155}
156
157/*
158 * Set up a signal frame.
159 */
160
161static inline int
162setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
163{
164 int err = 0;
165
166 err |= __put_user(regs->cs, &sc->cs);
167 err |= __put_user(0, &sc->gs);
168 err |= __put_user(0, &sc->fs);
169
170 err |= __put_user(regs->rdi, &sc->rdi);
171 err |= __put_user(regs->rsi, &sc->rsi);
172 err |= __put_user(regs->rbp, &sc->rbp);
173 err |= __put_user(regs->rsp, &sc->rsp);
174 err |= __put_user(regs->rbx, &sc->rbx);
175 err |= __put_user(regs->rdx, &sc->rdx);
176 err |= __put_user(regs->rcx, &sc->rcx);
177 err |= __put_user(regs->rax, &sc->rax);
178 err |= __put_user(regs->r8, &sc->r8);
179 err |= __put_user(regs->r9, &sc->r9);
180 err |= __put_user(regs->r10, &sc->r10);
181 err |= __put_user(regs->r11, &sc->r11);
182 err |= __put_user(regs->r12, &sc->r12);
183 err |= __put_user(regs->r13, &sc->r13);
184 err |= __put_user(regs->r14, &sc->r14);
185 err |= __put_user(regs->r15, &sc->r15);
186 err |= __put_user(me->thread.trap_no, &sc->trapno);
187 err |= __put_user(me->thread.error_code, &sc->err);
188 err |= __put_user(regs->rip, &sc->rip);
189 err |= __put_user(regs->eflags, &sc->eflags);
190 err |= __put_user(mask, &sc->oldmask);
191 err |= __put_user(me->thread.cr2, &sc->cr2);
192
193 return err;
194}
195
196/*
197 * Determine which stack to use..
198 */
199
200static void __user *
201get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
202{
203 unsigned long rsp;
204
205	/* Default to the normal stack, below the 128-byte red zone under %rsp */
206 rsp = regs->rsp - 128;
207
208 /* This is the X/Open sanctioned signal stack switching. */
209 if (ka->sa.sa_flags & SA_ONSTACK) {
210 if (sas_ss_flags(rsp) == 0)
211 rsp = current->sas_ss_sp + current->sas_ss_size;
212 }
213
214 return (void __user *)round_down(rsp - size, 16);
215}
216
217static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
218 sigset_t *set, struct pt_regs * regs)
219{
220 struct rt_sigframe __user *frame;
221 struct _fpstate __user *fp = NULL;
222 int err = 0;
223 struct task_struct *me = current;
224
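	/*
	 * The "- 8" below places the frame at 16*n - 8, so that %rsp + 8 is
	 * 16-byte aligned on handler entry, just as after a normal call.
	 */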
225 if (used_math()) {
226 fp = get_stack(ka, regs, sizeof(struct _fpstate));
227 frame = (void __user *)round_down(
228 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
229
230 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
231 goto give_sigsegv;
232
233 if (save_i387(fp) < 0)
234 err |= -1;
235 } else
236 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
237
238 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
239 goto give_sigsegv;
240
241 if (ka->sa.sa_flags & SA_SIGINFO) {
242 err |= copy_siginfo_to_user(&frame->info, info);
243 if (err)
244 goto give_sigsegv;
245 }
246
247 /* Create the ucontext. */
248 err |= __put_user(0, &frame->uc.uc_flags);
249 err |= __put_user(0, &frame->uc.uc_link);
250 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
251 err |= __put_user(sas_ss_flags(regs->rsp),
252 &frame->uc.uc_stack.ss_flags);
253 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
254 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
255 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
256 if (sizeof(*set) == 16) {
257 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
258 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
259 } else
260 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
261
262 /* Set up to return from userspace. If provided, use a stub
263 already in userspace. */
264 /* x86-64 should always use SA_RESTORER. */
265 if (ka->sa.sa_flags & SA_RESTORER) {
266 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
267 } else {
268		/* could use a vsyscall stub here */
269 goto give_sigsegv;
270 }
271
272 if (err)
273 goto give_sigsegv;
274
275#ifdef DEBUG_SIG
276 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
277#endif
278
279 /* Set up registers for signal handler */
280 regs->rdi = sig;
281 /* In case the signal handler was declared without prototypes */
282 regs->rax = 0;
283
284 /* This also works for non SA_SIGINFO handlers because they expect the
285 next argument after the signal number on the stack. */
286 regs->rsi = (unsigned long)&frame->info;
287 regs->rdx = (unsigned long)&frame->uc;
288 regs->rip = (unsigned long) ka->sa.sa_handler;
289
290 regs->rsp = (unsigned long)frame;
291
292 /* Set up the CS register to run signal handlers in 64-bit mode,
293 even if the handler happens to be interrupting 32-bit code. */
294 regs->cs = __USER_CS;
295
296 /* This, by contrast, has nothing to do with segment registers -
297 see include/asm-x86_64/uaccess.h for details. */
298 set_fs(USER_DS);
299
300 regs->eflags &= ~TF_MASK;
301 if (test_thread_flag(TIF_SINGLESTEP))
302 ptrace_notify(SIGTRAP);
303#ifdef DEBUG_SIG
304 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
305 current->comm, current->pid, frame, regs->rip, frame->pretcode);
306#endif
307
308 return 0;
309
310give_sigsegv:
311 force_sigsegv(sig, current);
312 return -EFAULT;
313}
314
315/*
316 * OK, we're invoking a handler
317 */
318
319static int
320handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 sigset_t *oldset, struct pt_regs *regs)
322{
323 int ret;
324
325#ifdef DEBUG_SIG
326 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
327 current->pid, sig,
328 regs->rip, regs->rsp, regs);
329#endif
330
331 /* Are we from a system call? */
332 if ((long)regs->orig_rax >= 0) {
333 /* If so, check system call restarting.. */
334 switch (regs->rax) {
335 case -ERESTART_RESTARTBLOCK:
336 case -ERESTARTNOHAND:
337 regs->rax = -EINTR;
338 break;
339
340 case -ERESTARTSYS:
341 if (!(ka->sa.sa_flags & SA_RESTART)) {
342 regs->rax = -EINTR;
343 break;
344 }
345 /* fallthrough */
346 case -ERESTARTNOINTR:
347 regs->rax = regs->orig_rax;
348 regs->rip -= 2;
349 break;
350 }
351 }
352
353 /*
354 * If TF is set due to a debugger (PT_DTRACE), clear the TF
355 * flag so that register information in the sigcontext is
356 * correct.
357 */
358 if (unlikely(regs->eflags & TF_MASK)) {
359 if (likely(current->ptrace & PT_DTRACE)) {
360 current->ptrace &= ~PT_DTRACE;
361 regs->eflags &= ~TF_MASK;
362 }
363 }
364
365#ifdef CONFIG_IA32_EMULATION
366 if (test_thread_flag(TIF_IA32)) {
367 if (ka->sa.sa_flags & SA_SIGINFO)
368 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
369 else
370 ret = ia32_setup_frame(sig, ka, oldset, regs);
371 } else
372#endif
373 ret = setup_rt_frame(sig, ka, info, oldset, regs);
374
375 if (ret == 0) {
376 spin_lock_irq(&current->sighand->siglock);
377 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
378 if (!(ka->sa.sa_flags & SA_NODEFER))
379 sigaddset(&current->blocked,sig);
380 recalc_sigpending();
381 spin_unlock_irq(&current->sighand->siglock);
382 }
383
384 return ret;
385}
386
387/*
388 * Note that 'init' is a special process: it doesn't get signals it doesn't
389 * want to handle. Thus you cannot kill init even with a SIGKILL even by
390 * mistake.
391 */
392static void do_signal(struct pt_regs *regs)
393{
394 struct k_sigaction ka;
395 siginfo_t info;
396 int signr;
397 sigset_t *oldset;
398
399 /*
400 * We want the common case to go fast, which
401 * is why we may in certain cases get here from
402 * kernel mode. Just return without doing anything
403 * if so.
404 */
405 if (!user_mode(regs))
406 return;
407
408 if (test_thread_flag(TIF_RESTORE_SIGMASK))
409 oldset = &current->saved_sigmask;
410 else
411 oldset = &current->blocked;
412
413 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
414 if (signr > 0) {
415 /* Reenable any watchpoints before delivering the
416 * signal to user space. The processor register will
417 * have been cleared if the watchpoint triggered
418 * inside the kernel.
419 */
420 if (current->thread.debugreg7)
421 set_debugreg(current->thread.debugreg7, 7);
422
423 /* Whee! Actually deliver the signal. */
424 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
425 /* a signal was successfully delivered; the saved
426 * sigmask will have been stored in the signal frame,
427 * and will be restored by sigreturn, so we can simply
428 * clear the TIF_RESTORE_SIGMASK flag */
429 clear_thread_flag(TIF_RESTORE_SIGMASK);
430 }
431 return;
432 }
433
434 /* Did we come from a system call? */
435 if ((long)regs->orig_rax >= 0) {
436 /* Restart the system call - no handlers present */
437 long res = regs->rax;
438 switch (res) {
439 case -ERESTARTNOHAND:
440 case -ERESTARTSYS:
441 case -ERESTARTNOINTR:
442 regs->rax = regs->orig_rax;
443 regs->rip -= 2;
444 break;
445 case -ERESTART_RESTARTBLOCK:
446 regs->rax = test_thread_flag(TIF_IA32) ?
447 __NR_ia32_restart_syscall :
448 __NR_restart_syscall;
449 regs->rip -= 2;
450 break;
451 }
452 }
453
454 /* if there's no signal to deliver, we just put the saved sigmask
455 back. */
456 if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
457 clear_thread_flag(TIF_RESTORE_SIGMASK);
458 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
459 }
460}
461
462void
463do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
464{
465#ifdef DEBUG_SIG
466 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
467 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
468#endif
469
470 /* Pending single-step? */
471 if (thread_info_flags & _TIF_SINGLESTEP) {
472 regs->eflags |= TF_MASK;
473 clear_thread_flag(TIF_SINGLESTEP);
474 }
475
476#ifdef CONFIG_X86_MCE
477 /* notify userspace of pending MCEs */
478 if (thread_info_flags & _TIF_MCE_NOTIFY)
479 mce_notify_user();
480#endif /* CONFIG_X86_MCE */
481
482 /* deal with pending signal delivery */
483 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
484 do_signal(regs);
485}
486
487void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
488{
489 struct task_struct *me = current;
490 if (show_unhandled_signals && printk_ratelimit())
491 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
492 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
493
494 force_sig(SIGSEGV, me);
495}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644
index 000000000000..2d35d8502029
--- /dev/null
+++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * This code is released under the GNU General Public License version 2 or
8 * later.
9 */
10
11#include <linux/init.h>
12
13#include <linux/mm.h>
14#include <linux/delay.h>
15#include <linux/spinlock.h>
16#include <linux/kernel_stat.h>
17#include <linux/mc146818rtc.h>
18#include <linux/cache.h>
19#include <linux/interrupt.h>
20#include <linux/cpu.h>
21#include <linux/module.h>
22
23#include <asm/mtrr.h>
24#include <asm/tlbflush.h>
25#include <asm/mmu_context.h>
26#include <mach_apic.h>
27
28/*
29 * Some notes on x86 processor bugs affecting SMP operation:
30 *
31 * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
32 * The Linux implications for SMP are handled as follows:
33 *
34 * Pentium III / [Xeon]
35 * None of the E1AP-E3AP errata are visible to the user.
36 *
37 * E1AP. see PII A1AP
38 * E2AP. see PII A2AP
39 * E3AP. see PII A3AP
40 *
41 * Pentium II / [Xeon]
42 * None of the A1AP-A3AP errata are visible to the user.
43 *
44 * A1AP. see PPro 1AP
45 * A2AP. see PPro 2AP
46 * A3AP. see PPro 7AP
47 *
48 * Pentium Pro
49 *	None of the 1AP-9AP errata are visible to the normal user,
50 * except occasional delivery of 'spurious interrupt' as trap #15.
51 * This is very rare and a non-problem.
52 *
53 * 1AP. Linux maps APIC as non-cacheable
54 * 2AP. worked around in hardware
55 * 3AP. fixed in C0 and above steppings microcode update.
56 * Linux does not use excessive STARTUP_IPIs.
57 * 4AP. worked around in hardware
58 * 5AP. symmetric IO mode (normal Linux operation) not affected.
59 * 'noapic' mode has vector 0xf filled out properly.
60 * 6AP. 'noapic' mode might be affected - fixed in later steppings
61 *	7AP.	We do not assume writes to the LVT deasserting IRQs
62 * 8AP. We do not enable low power mode (deep sleep) during MP bootup
63 * 9AP. We do not use mixed mode
64 *
65 * Pentium
66 * There is a marginal case where REP MOVS on 100MHz SMP
67 * machines with B stepping processors can fail. XXX should provide
68 * an L1cache=Writethrough or L1cache=off option.
69 *
70 * B stepping CPUs may hang. There are hardware work arounds
71 * for this. We warn about it in case your board doesn't have the work
72 *	arounds. Basically that's so I can tell anyone with a B stepping
73 * CPU and SMP problems "tough".
74 *
75 * Specific items [From Pentium Processor Specification Update]
76 *
77 * 1AP. Linux doesn't use remote read
78 * 2AP. Linux doesn't trust APIC errors
79 * 3AP. We work around this
80 * 4AP. Linux never generated 3 interrupts of the same priority
81 * to cause a lost local interrupt.
82 * 5AP. Remote read is never used
83 * 6AP. not affected - worked around in hardware
84 * 7AP. not affected - worked around in hardware
85 * 8AP. worked around in hardware - we get explicit CS errors if not
86 * 9AP. only 'noapic' mode affected. Might generate spurious
87 * interrupts, we log only the first one and count the
88 * rest silently.
89 * 10AP. not affected - worked around in hardware
90 * 11AP. Linux reads the APIC between writes to avoid this, as per
91 * the documentation. Make sure you preserve this as it affects
92 * the C stepping chips too.
93 * 12AP. not affected - worked around in hardware
94 * 13AP. not affected - worked around in hardware
95 * 14AP. we always deassert INIT during bootup
96 * 15AP. not affected - worked around in hardware
97 * 16AP. not affected - worked around in hardware
98 * 17AP. not affected - worked around in hardware
99 * 18AP. not affected - worked around in hardware
100 * 19AP. not affected - worked around in BIOS
101 *
102 * If this sounds worrying, believe me these bugs are either ___RARE___,
103 * or are signal timing bugs worked around in hardware and there's
104 * next to nothing of note from the C stepping upwards.
105 */
106
107DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
108
109/*
110 * the following functions deal with sending IPIs between CPUs.
111 *
112 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
113 */
114
115static inline int __prepare_ICR (unsigned int shortcut, int vector)
116{
117 unsigned int icr = shortcut | APIC_DEST_LOGICAL;
118
119 switch (vector) {
120 default:
121 icr |= APIC_DM_FIXED | vector;
122 break;
123 case NMI_VECTOR:
124 icr |= APIC_DM_NMI;
125 break;
126 }
127 return icr;
128}
129
130static inline int __prepare_ICR2 (unsigned int mask)
131{
132 return SET_APIC_DEST_FIELD(mask);
133}
134
135void __send_IPI_shortcut(unsigned int shortcut, int vector)
136{
137 /*
138 * Subtle. In the case of the 'never do double writes' workaround
139	 * we have to lock out interrupts to be safe. As we don't care
140	 * about the value read, we use an atomic rmw access to avoid costly
141 * cli/sti. Otherwise we use an even cheaper single atomic write
142 * to the APIC.
143 */
144 unsigned int cfg;
145
146 /*
147 * Wait for idle.
148 */
149 apic_wait_icr_idle();
150
151 /*
152 * No need to touch the target chip field
153 */
154 cfg = __prepare_ICR(shortcut, vector);
155
156 /*
157 * Send the IPI. The write to APIC_ICR fires this off.
158 */
159 apic_write_around(APIC_ICR, cfg);
160}
161
162void fastcall send_IPI_self(int vector)
163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165}
166
167/*
168 * This is used to send an IPI with no shorthand notation (the destination is
169 * specified in bits 56 to 63 of the ICR).
170 */
171static inline void __send_IPI_dest_field(unsigned long mask, int vector)
172{
173 unsigned long cfg;
174
175 /*
176 * Wait for idle.
177 */
178 if (unlikely(vector == NMI_VECTOR))
179 safe_apic_wait_icr_idle();
180 else
181 apic_wait_icr_idle();
182
183 /*
184 * prepare target chip field
185 */
186 cfg = __prepare_ICR2(mask);
187 apic_write_around(APIC_ICR2, cfg);
188
189 /*
190 * program the ICR
191 */
192 cfg = __prepare_ICR(0, vector);
193
194 /*
195 * Send the IPI. The write to APIC_ICR fires this off.
196 */
197 apic_write_around(APIC_ICR, cfg);
198}
199
200/*
201 * This is only used on smaller machines.
202 */
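/*
 * Only the first word of the cpumask is used here, so this path can
 * address at most BITS_PER_LONG CPUs.
 */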
203void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
204{
205 unsigned long mask = cpus_addr(cpumask)[0];
206 unsigned long flags;
207
208 local_irq_save(flags);
209 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
210 __send_IPI_dest_field(mask, vector);
211 local_irq_restore(flags);
212}
213
214void send_IPI_mask_sequence(cpumask_t mask, int vector)
215{
216 unsigned long flags;
217 unsigned int query_cpu;
218
219 /*
220 * Hack. The clustered APIC addressing mode doesn't allow us to send
221	 * to an arbitrary mask, so I do unicasts to each CPU instead. This
222 * should be modified to do 1 message per cluster ID - mbligh
223 */
224
225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector);
230 }
231 }
232 local_irq_restore(flags);
233}
234
235#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
236
237/*
238 * Smarter SMP flushing macros.
239 * c/o Linus Torvalds.
240 *
241 * These mean you can really definitely utterly forget about
242 *	writing to user space from interrupts. (It's not allowed anyway).
243 *
244 * Optimizations Manfred Spraul <manfred@colorfullife.com>
245 */
246
247static cpumask_t flush_cpumask;
248static struct mm_struct * flush_mm;
249static unsigned long flush_va;
250static DEFINE_SPINLOCK(tlbstate_lock);
251
252/*
253 * We cannot call mmdrop() because we are in interrupt context,
254 * instead update mm->cpu_vm_mask.
255 *
256 * We need to reload %cr3 since the page tables may be going
257 * away from under us..
258 */
259void leave_mm(unsigned long cpu)
260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir);
265}
266
267/*
268 *
269 * The flush IPI assumes that a thread switch happens in this order:
270 * [cpu0: the cpu that switches]
271 * 1) switch_mm() either 1a) or 1b)
272 * 1a) thread switch to a different mm
273 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
274 * Stop ipi delivery for the old mm. This is not synchronized with
275 * 	Stop ipi delivery for the old mm. This is not synchronized with
276 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
277 * tlb flush.
278 * 1a2) set cpu_tlbstate to TLBSTATE_OK
279 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
280 * was in lazy tlb mode.
281 * 1a3) update cpu_tlbstate[].active_mm
282 * Now cpu0 accepts tlb flushes for the new mm.
283 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
284 * Now the other cpus will send tlb flush ipis.
285 * 1a4) change cr3.
286 * 1b) thread switch without mm change
287 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
288 * flush ipis.
289 * 1b1) set cpu_tlbstate to TLBSTATE_OK
290 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
291 * Atomically set the bit [other cpus will start sending flush ipis],
292 * and test the bit.
293 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
294 * 2) switch %%esp, ie current
295 *
296 * The interrupt must handle 2 special cases:
297 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
298 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
299 * runs in kernel space, the cpu could load tlb entries for user space
300 * pages.
301 *
302 * The good news is that cpu_tlbstate is local to each cpu, no
303 * write/read ordering problems.
304 */
305
306/*
307 * TLB flush IPI:
308 *
309 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
310 * 2) Leave the mm if we are in the lazy tlb mode.
311 */
312
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
314{
315 unsigned long cpu;
316
317 cpu = get_cpu();
318
319 if (!cpu_isset(cpu, flush_cpumask))
320 goto out;
321 /*
322 * This was a BUG() but until someone can quote me the
323 * line from the intel manual that guarantees an IPI to
324 * multiple CPUs is retried _only_ on the erroring CPUs
325	 * it's staying as a return
326 *
327 * BUG();
328 */
329
330 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
331 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
332 if (flush_va == TLB_FLUSH_ALL)
333 local_flush_tlb();
334 else
335 __flush_tlb_one(flush_va);
336 } else
337 leave_mm(cpu);
338 }
339 ack_APIC_irq();
340 smp_mb__before_clear_bit();
341 cpu_clear(cpu, flush_cpumask);
342 smp_mb__after_clear_bit();
343out:
344 put_cpu_no_resched();
345}
346
347void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
348 unsigned long va)
349{
350 cpumask_t cpumask = *cpumaskp;
351
352 /*
353 * A couple of (to be removed) sanity checks:
354 *
355 * - current CPU must not be in mask
356 * - mask must exist :)
357 */
358 BUG_ON(cpus_empty(cpumask));
359 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
360 BUG_ON(!mm);
361
362#ifdef CONFIG_HOTPLUG_CPU
363 /* If a CPU which we ran on has gone down, OK. */
364 cpus_and(cpumask, cpumask, cpu_online_map);
365 if (unlikely(cpus_empty(cpumask)))
366 return;
367#endif
368
369 /*
370	 * I'm not happy about this global shared spinlock in the
371 * MM hot path, but we'll see how contended it is.
372 * AK: x86-64 has a faster method that could be ported.
373 */
374 spin_lock(&tlbstate_lock);
375
376 flush_mm = mm;
377 flush_va = va;
378 cpus_or(flush_cpumask, cpumask, flush_cpumask);
379 /*
380 * We have to send the IPI only to
381 * CPUs affected.
382 */
383 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
384
385 while (!cpus_empty(flush_cpumask))
386 /* nothing. lockup detection does not belong here */
387 cpu_relax();
388
389 flush_mm = NULL;
390 flush_va = 0;
391 spin_unlock(&tlbstate_lock);
392}
393
394void flush_tlb_current_task(void)
395{
396 struct mm_struct *mm = current->mm;
397 cpumask_t cpu_mask;
398
399 preempt_disable();
400 cpu_mask = mm->cpu_vm_mask;
401 cpu_clear(smp_processor_id(), cpu_mask);
402
403 local_flush_tlb();
404 if (!cpus_empty(cpu_mask))
405 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
406 preempt_enable();
407}
408
409void flush_tlb_mm (struct mm_struct * mm)
410{
411 cpumask_t cpu_mask;
412
413 preempt_disable();
414 cpu_mask = mm->cpu_vm_mask;
415 cpu_clear(smp_processor_id(), cpu_mask);
416
417 if (current->active_mm == mm) {
418 if (current->mm)
419 local_flush_tlb();
420 else
421 leave_mm(smp_processor_id());
422 }
423 if (!cpus_empty(cpu_mask))
424 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
425
426 preempt_enable();
427}
428
429void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
430{
431 struct mm_struct *mm = vma->vm_mm;
432 cpumask_t cpu_mask;
433
434 preempt_disable();
435 cpu_mask = mm->cpu_vm_mask;
436 cpu_clear(smp_processor_id(), cpu_mask);
437
438 if (current->active_mm == mm) {
439		if (current->mm)
440 __flush_tlb_one(va);
441 else
442 leave_mm(smp_processor_id());
443 }
444
445 if (!cpus_empty(cpu_mask))
446 flush_tlb_others(cpu_mask, mm, va);
447
448 preempt_enable();
449}
450EXPORT_SYMBOL(flush_tlb_page);
451
452static void do_flush_tlb_all(void* info)
453{
454 unsigned long cpu = smp_processor_id();
455
456 __flush_tlb_all();
457 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
458 leave_mm(cpu);
459}
460
461void flush_tlb_all(void)
462{
463 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
464}
465
466/*
467 * this function sends a 'reschedule' IPI to another CPU.
468 * it goes straight through and wastes no time serializing
469 * anything. Worst case is that we lose a reschedule ...
470 */
471static void native_smp_send_reschedule(int cpu)
472{
473 WARN_ON(cpu_is_offline(cpu));
474 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
475}
476
477/*
478 * Structure and data for smp_call_function(). This is designed to minimise
479 * static memory requirements. It also looks cleaner.
480 */
481static DEFINE_SPINLOCK(call_lock);
482
483struct call_data_struct {
484 void (*func) (void *info);
485 void *info;
486 atomic_t started;
487 atomic_t finished;
488 int wait;
489};
490
491void lock_ipi_call_lock(void)
492{
493 spin_lock_irq(&call_lock);
494}
495
496void unlock_ipi_call_lock(void)
497{
498 spin_unlock_irq(&call_lock);
499}
500
501static struct call_data_struct *call_data;
502
503static void __smp_call_function(void (*func) (void *info), void *info,
504 int nonatomic, int wait)
505{
506 struct call_data_struct data;
507 int cpus = num_online_cpus() - 1;
508
509 if (!cpus)
510 return;
511
512 data.func = func;
513 data.info = info;
514 atomic_set(&data.started, 0);
515 data.wait = wait;
516 if (wait)
517 atomic_set(&data.finished, 0);
518
519 call_data = &data;
520 mb();
521
522 /* Send a message to all other CPUs and wait for them to respond */
523 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
524
525 /* Wait for response */
526 while (atomic_read(&data.started) != cpus)
527 cpu_relax();
528
529 if (wait)
530 while (atomic_read(&data.finished) != cpus)
531 cpu_relax();
532}
533
534
535/**
536 * smp_call_function_mask(): Run a function on a set of other CPUs.
537 * @mask: The set of cpus to run on. Must not include the current cpu.
538 * @func: The function to run. This must be fast and non-blocking.
539 * @info: An arbitrary pointer to pass to the function.
540 * @wait: If true, wait (atomically) until function has completed on other CPUs.
541 *
542 * Returns 0 on success, else a negative status code.
543 *
544 * If @wait is true, then returns once @func has returned; otherwise
545 * it returns just before the target cpu calls @func.
546 *
547 * You must not call this function with disabled interrupts or from a
548 * hardware interrupt handler or from a bottom half handler.
549 */
550static int
551native_smp_call_function_mask(cpumask_t mask,
552 void (*func)(void *), void *info,
553 int wait)
554{
555 struct call_data_struct data;
556 cpumask_t allbutself;
557 int cpus;
558
559 /* Can deadlock when called with interrupts disabled */
560 WARN_ON(irqs_disabled());
561
562 /* Holding any lock stops cpus from going down. */
563 spin_lock(&call_lock);
564
565 allbutself = cpu_online_map;
566 cpu_clear(smp_processor_id(), allbutself);
567
568 cpus_and(mask, mask, allbutself);
569 cpus = cpus_weight(mask);
570
571 if (!cpus) {
572 spin_unlock(&call_lock);
573 return 0;
574 }
575
576 data.func = func;
577 data.info = info;
578 atomic_set(&data.started, 0);
579 data.wait = wait;
580 if (wait)
581 atomic_set(&data.finished, 0);
582
583 call_data = &data;
584 mb();
585
586 /* Send a message to other CPUs */
587 if (cpus_equal(mask, allbutself))
588 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
589 else
590 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
591
592 /* Wait for response */
593 while (atomic_read(&data.started) != cpus)
594 cpu_relax();
595
596 if (wait)
597 while (atomic_read(&data.finished) != cpus)
598 cpu_relax();
599 spin_unlock(&call_lock);
600
601 return 0;
602}
603
604static void stop_this_cpu (void * dummy)
605{
606 local_irq_disable();
607 /*
608 * Remove this CPU:
609 */
610 cpu_clear(smp_processor_id(), cpu_online_map);
611 disable_local_APIC();
612 if (cpu_data[smp_processor_id()].hlt_works_ok)
613 for(;;) halt();
614 for (;;);
615}
616
617/*
618 * this function calls the 'stop' function on all other CPUs in the system.
619 */
620
621static void native_smp_send_stop(void)
622{
623 /* Don't deadlock on the call lock in panic */
624 int nolock = !spin_trylock(&call_lock);
625 unsigned long flags;
626
627 local_irq_save(flags);
628 __smp_call_function(stop_this_cpu, NULL, 0, 0);
629 if (!nolock)
630 spin_unlock(&call_lock);
631 disable_local_APIC();
632 local_irq_restore(flags);
633}
634
635/*
636 * Reschedule call back. Nothing to do,
637 * all the work is done automatically when
638 * we return from the interrupt.
639 */
640fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
641{
642 ack_APIC_irq();
643}
644
645fastcall void smp_call_function_interrupt(struct pt_regs *regs)
646{
647 void (*func) (void *info) = call_data->func;
648 void *info = call_data->info;
649 int wait = call_data->wait;
650
651 ack_APIC_irq();
652 /*
653 * Notify initiating CPU that I've grabbed the data and am
654 * about to execute the function
655 */
656 mb();
657 atomic_inc(&call_data->started);
658 /*
659 * At this point the info structure may be out of scope unless wait==1
660 */
661 irq_enter();
662 (*func)(info);
663 irq_exit();
664
665 if (wait) {
666 mb();
667 atomic_inc(&call_data->finished);
668 }
669}
670
671static int convert_apicid_to_cpu(int apic_id)
672{
673 int i;
674
675 for (i = 0; i < NR_CPUS; i++) {
676 if (x86_cpu_to_apicid[i] == apic_id)
677 return i;
678 }
679 return -1;
680}
681
682int safe_smp_processor_id(void)
683{
684 int apicid, cpuid;
685
686 if (!boot_cpu_has(X86_FEATURE_APIC))
687 return 0;
688
689 apicid = hard_smp_processor_id();
690 if (apicid == BAD_APICID)
691 return 0;
692
693 cpuid = convert_apicid_to_cpu(apicid);
694
695 return cpuid >= 0 ? cpuid : 0;
696}
697
698struct smp_ops smp_ops = {
699 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
700 .smp_prepare_cpus = native_smp_prepare_cpus,
701 .cpu_up = native_cpu_up,
702 .smp_cpus_done = native_smp_cpus_done,
703
704 .smp_send_stop = native_smp_send_stop,
705 .smp_send_reschedule = native_smp_send_reschedule,
706 .smp_call_function_mask = native_smp_call_function_mask,
707};
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
new file mode 100644
index 000000000000..df4a82812adb
--- /dev/null
+++ b/arch/x86/kernel/smp_64.c
@@ -0,0 +1,523 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/delay.h>
16#include <linux/spinlock.h>
17#include <linux/smp.h>
18#include <linux/kernel_stat.h>
19#include <linux/mc146818rtc.h>
20#include <linux/interrupt.h>
21
22#include <asm/mtrr.h>
23#include <asm/pgalloc.h>
24#include <asm/tlbflush.h>
25#include <asm/mach_apic.h>
26#include <asm/mmu_context.h>
27#include <asm/proto.h>
28#include <asm/apicdef.h>
29#include <asm/idle.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
36 *	writing to user space from interrupts. (It's not allowed anyway).
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 *
40 * More scalable flush, from Andi Kleen
41 *
42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data.
46 *
47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */
52
53union smp_flush_state {
54 struct {
55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm;
57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock;
60 };
61 char pad[SMP_CACHE_BYTES];
62} ____cacheline_aligned;
63
64/* State is put into the per CPU data section, but padded
65 to a full cache line because other CPUs can access it and we don't
66 want false sharing in the per cpu data segment. */
67static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68
69/*
70 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask.
72 */
73static inline void leave_mm(int cpu)
74{
75 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir);
79}
80
81/*
82 *
83 * The flush IPI assumes that a thread switch happens in this order:
84 * [cpu0: the cpu that switches]
85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with
89 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis.
99 * 1a4) change cr3.
100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current
109 *
110 * The interrupt must handle 2 special cases:
111 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
112 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
113 * runs in kernel space, the cpu could load tlb entries for user space
114 * pages.
115 *
116 * The good news is that cpu mmu_state is local to each cpu, no
117 * write/read ordering problems.
118 */
119
120/*
121 * TLB flush IPI:
122 *
123 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
124 * 2) Leave the mm if we are in the lazy tlb mode.
125 *
126 * Interrupts are disabled.
127 */
128
129asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
130{
131 int cpu;
132 int sender;
133 union smp_flush_state *f;
134
135 cpu = smp_processor_id();
136 /*
137 * orig_rax contains the negated interrupt vector.
138 * Use that to determine where the sender put the data.
139 */
140 sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
141 f = &per_cpu(flush_state, sender);
142
143 if (!cpu_isset(cpu, f->flush_cpumask))
144 goto out;
145 /*
146 * This was a BUG() but until someone can quote me the
147 * line from the Intel manual that guarantees an IPI to
148 * multiple CPUs is retried _only_ on the erroring CPUs,
149 * it's staying as a return
150 *
151 * BUG();
152 */
153
154 if (f->flush_mm == read_pda(active_mm)) {
155 if (read_pda(mmu_state) == TLBSTATE_OK) {
156 if (f->flush_va == FLUSH_ALL)
157 local_flush_tlb();
158 else
159 __flush_tlb_one(f->flush_va);
160 } else
161 leave_mm(cpu);
162 }
163out:
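 /*
 * Acknowledge the interrupt and then clear our bit in flush_cpumask;
 * clearing the bit is what releases the sender, which spins in
 * flush_tlb_others() until the mask is empty.
 */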
164 ack_APIC_irq();
165 cpu_clear(cpu, f->flush_cpumask);
166}
167
168static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
169 unsigned long va)
170{
171 int sender;
172 union smp_flush_state *f;
173
174 /* Caller has disabled preemption */
175 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
176 f = &per_cpu(flush_state, sender);
177
178 /* Could avoid this lock when
179 num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
180 probably not worth checking this for a cache-hot lock. */
181 spin_lock(&f->tlbstate_lock);
182
183 f->flush_mm = mm;
184 f->flush_va = va;
185 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
186
187 /*
188 * We have to send the IPI only to
189 * CPUs affected.
190 */
191 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
192
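 /*
 * Spin until every CPU named in flush_cpumask has taken the IPI and
 * cleared its bit in smp_invalidate_interrupt(); only then can this
 * slot's flush_mm/flush_va safely be reused.
 */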
193 while (!cpus_empty(f->flush_cpumask))
194 cpu_relax();
195
196 f->flush_mm = NULL;
197 f->flush_va = 0;
198 spin_unlock(&f->tlbstate_lock);
199}
200
201int __cpuinit init_smp_flush(void)
202{
203 int i;
204 for_each_cpu_mask(i, cpu_possible_map) {
205 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
206 }
207 return 0;
208}
209
210core_initcall(init_smp_flush);
211
212void flush_tlb_current_task(void)
213{
214 struct mm_struct *mm = current->mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 local_flush_tlb();
222 if (!cpus_empty(cpu_mask))
223 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
224 preempt_enable();
225}
226EXPORT_SYMBOL(flush_tlb_current_task);
227
228void flush_tlb_mm (struct mm_struct * mm)
229{
230 cpumask_t cpu_mask;
231
232 preempt_disable();
233 cpu_mask = mm->cpu_vm_mask;
234 cpu_clear(smp_processor_id(), cpu_mask);
235
236 if (current->active_mm == mm) {
237 if (current->mm)
238 local_flush_tlb();
239 else
240 leave_mm(smp_processor_id());
241 }
242 if (!cpus_empty(cpu_mask))
243 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
244
245 preempt_enable();
246}
247EXPORT_SYMBOL(flush_tlb_mm);
248
249void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
250{
251 struct mm_struct *mm = vma->vm_mm;
252 cpumask_t cpu_mask;
253
254 preempt_disable();
255 cpu_mask = mm->cpu_vm_mask;
256 cpu_clear(smp_processor_id(), cpu_mask);
257
258 if (current->active_mm == mm) {
259 if(current->mm)
260 __flush_tlb_one(va);
261 else
262 leave_mm(smp_processor_id());
263 }
264
265 if (!cpus_empty(cpu_mask))
266 flush_tlb_others(cpu_mask, mm, va);
267
268 preempt_enable();
269}
270EXPORT_SYMBOL(flush_tlb_page);
271
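/*
 * Helper run on every CPU by flush_tlb_all() below: flush the entire
 * local TLB, and if this CPU was only lazily borrowing an mm, drop out
 * of it via leave_mm().
 */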
272static void do_flush_tlb_all(void* info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
284}
285
286/*
287 * this function sends a 'reschedule' IPI to another CPU.
288 * it goes straight through and wastes no time serializing
289 * anything. Worst case is that we lose a reschedule ...
290 */
291
292void smp_send_reschedule(int cpu)
293{
294 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
295}
296
297/*
298 * Structure and data for smp_call_function(). This is designed to minimise
299 * static memory requirements. It also looks cleaner.
300 */
301static DEFINE_SPINLOCK(call_lock);
302
303struct call_data_struct {
304 void (*func) (void *info);
305 void *info;
306 atomic_t started;
307 atomic_t finished;
308 int wait;
309};
310
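/*
 * call_data points at a call_data_struct living on the initiating CPU's
 * stack.  The IPI handler (smp_call_function_interrupt) bumps ->started
 * once it has read func/info, and ->finished (when wait is set) after
 * the function has run, so the initiator knows when the structure may
 * go out of scope.
 */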
311static struct call_data_struct * call_data;
312
313void lock_ipi_call_lock(void)
314{
315 spin_lock_irq(&call_lock);
316}
317
318void unlock_ipi_call_lock(void)
319{
320 spin_unlock_irq(&call_lock);
321}
322
323/*
324 * this function sends a 'generic call function' IPI to one other CPU
325 * in the system.
326 *
327 * cpu is a standard Linux logical CPU number.
328 */
329static void
330__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
331 int nonatomic, int wait)
332{
333 struct call_data_struct data;
334 int cpus = 1;
335
336 data.func = func;
337 data.info = info;
338 atomic_set(&data.started, 0);
339 data.wait = wait;
340 if (wait)
341 atomic_set(&data.finished, 0);
342
343 call_data = &data;
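 /*
 * Order the stores above against the IPI below: the receiving CPU must
 * see the fully initialised call_data before it dereferences it in
 * smp_call_function_interrupt().
 */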
344 wmb();
345 /* Send a message to the target CPU and wait for it to respond */
346 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
347
348 /* Wait for response */
349 while (atomic_read(&data.started) != cpus)
350 cpu_relax();
351
352 if (!wait)
353 return;
354
355 while (atomic_read(&data.finished) != cpus)
356 cpu_relax();
357}
358
359/*
360 * smp_call_function_single - Run a function on a specific CPU
361 * @func: The function to run. This must be fast and non-blocking.
362 * @info: An arbitrary pointer to pass to the function.
363 * @nonatomic: Currently unused.
364 * @wait: If true, wait until function has completed on other CPUs.
365 *
366 * Returns 0 on success, else a negative status code.
367 *
368 * Does not return until the remote CPU is nearly ready to execute <func>,
369 * is executing it, or has already executed it.
370 */
371
372int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
373 int nonatomic, int wait)
374{
375 /* prevent preemption and reschedule on another processor */
376 int me = get_cpu();
377
378 /* Can deadlock when called with interrupts disabled */
379 WARN_ON(irqs_disabled());
380
381 if (cpu == me) {
382 local_irq_disable();
383 func(info);
384 local_irq_enable();
385 put_cpu();
386 return 0;
387 }
388
389 spin_lock(&call_lock);
390 __smp_call_function_single(cpu, func, info, nonatomic, wait);
391 spin_unlock(&call_lock);
392 put_cpu();
393 return 0;
394}
395EXPORT_SYMBOL(smp_call_function_single);
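/*
 * Illustrative use of smp_call_function_single() (hypothetical callback,
 * not part of this file): run a fast, non-blocking function on one CPU
 * and wait for it to finish.
 *
 *	static void read_local_counter(void *info)
 *	{
 *		*(u64 *)info = ...;	/- whatever per-CPU work is needed
 *	}
 *
 *	u64 val;
 *	smp_call_function_single(cpu, read_local_counter, &val, 0, 1);
 */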
396
397/*
398 * this function sends a 'generic call function' IPI to all other CPUs
399 * in the system.
400 */
401static void __smp_call_function (void (*func) (void *info), void *info,
402 int nonatomic, int wait)
403{
404 struct call_data_struct data;
405 int cpus = num_online_cpus()-1;
406
407 if (!cpus)
408 return;
409
410 data.func = func;
411 data.info = info;
412 atomic_set(&data.started, 0);
413 data.wait = wait;
414 if (wait)
415 atomic_set(&data.finished, 0);
416
417 call_data = &data;
418 wmb();
419 /* Send a message to all other CPUs and wait for them to respond */
420 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
421
422 /* Wait for response */
423 while (atomic_read(&data.started) != cpus)
424 cpu_relax();
425
426 if (!wait)
427 return;
428
429 while (atomic_read(&data.finished) != cpus)
430 cpu_relax();
431}
432
433/*
434 * smp_call_function - run a function on all other CPUs.
435 * @func: The function to run. This must be fast and non-blocking.
436 * @info: An arbitrary pointer to pass to the function.
437 * @nonatomic: currently unused.
438 * @wait: If true, wait (atomically) until function has completed on other
439 * CPUs.
440 *
441 * Returns 0 on success, else a negative status code. Does not return until
442 * remote CPUs are nearly ready to execute func, are executing it, or have executed it.
443 *
444 * You must not call this function with disabled interrupts or from a
445 * hardware interrupt handler or from a bottom half handler.
446 * Actually there are a few legal cases, like panic.
447 */
448int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
449 int wait)
450{
451 spin_lock(&call_lock);
452 __smp_call_function(func, info, nonatomic, wait);
453 spin_unlock(&call_lock);
454 return 0;
455}
456EXPORT_SYMBOL(smp_call_function);
457
458static void stop_this_cpu(void *dummy)
459{
460 local_irq_disable();
461 /*
462 * Remove this CPU:
463 */
464 cpu_clear(smp_processor_id(), cpu_online_map);
465 disable_local_APIC();
466 for (;;)
467 halt();
468}
469
470void smp_send_stop(void)
471{
472 int nolock;
473 unsigned long flags;
474
475 if (reboot_force)
476 return;
477
478 /* Don't deadlock on the call lock in panic */
479 nolock = !spin_trylock(&call_lock);
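 /*
 * If the trylock failed (e.g. the lock holder is the CPU that
 * panicked), carry on anyway; we only skip the unlock below.
 */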
480 local_irq_save(flags);
481 __smp_call_function(stop_this_cpu, NULL, 0, 0);
482 if (!nolock)
483 spin_unlock(&call_lock);
484 disable_local_APIC();
485 local_irq_restore(flags);
486}
487
488/*
489 * Reschedule callback. Nothing to do;
490 * all the work is done automatically when
491 * we return from the interrupt.
492 */
493asmlinkage void smp_reschedule_interrupt(void)
494{
495 ack_APIC_irq();
496}
497
498asmlinkage void smp_call_function_interrupt(void)
499{
500 void (*func) (void *info) = call_data->func;
501 void *info = call_data->info;
502 int wait = call_data->wait;
503
504 ack_APIC_irq();
505 /*
506 * Notify initiating CPU that I've grabbed the data and am
507 * about to execute the function
508 */
509 mb();
510 atomic_inc(&call_data->started);
511 /*
512 * At this point the info structure may be out of scope unless wait==1
513 */
514 exit_idle();
515 irq_enter();
516 (*func)(info);
517 irq_exit();
518 if (wait) {
519 mb();
520 atomic_inc(&call_data->finished);
521 }
522}
523
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
new file mode 100644
index 000000000000..e4f61d1c6248
--- /dev/null
+++ b/arch/x86/kernel/smpboot_32.c
@@ -0,0 +1,1322 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 *
7 * Much of the core SMP work is based on previous work by Thomas Radke, to
8 * whom a great many thanks are extended.
9 *
10 * Thanks to Intel for making available several different Pentium,
11 * Pentium Pro and Pentium-II/Xeon MP machines.
12 * Original development of Linux SMP code supported by Caldera.
13 *
14 * This code is released under the GNU General Public License version 2 or
15 * later.
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIPS report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Martin J. Bligh : Added support for multi-quad systems
33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35
36#include <linux/module.h>
37#include <linux/init.h>
38#include <linux/kernel.h>
39
40#include <linux/mm.h>
41#include <linux/sched.h>
42#include <linux/kernel_stat.h>
43#include <linux/bootmem.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/percpu.h>
47#include <linux/nmi.h>
48
49#include <linux/delay.h>
50#include <linux/mc146818rtc.h>
51#include <asm/tlbflush.h>
52#include <asm/desc.h>
53#include <asm/arch_hooks.h>
54#include <asm/nmi.h>
55
56#include <mach_apic.h>
57#include <mach_wakecpu.h>
58#include <smpboot_hooks.h>
59#include <asm/vmi.h>
60#include <asm/mtrr.h>
61
62/* Set if we find a B stepping CPU */
63static int __devinitdata smp_b_stepping;
64
65/* Number of siblings per CPU package */
66int smp_num_siblings = 1;
67EXPORT_SYMBOL(smp_num_siblings);
68
69/* Last level cache ID of each logical CPU */
70int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
71
72/* representing HT siblings of each logical CPU */
73cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
74EXPORT_SYMBOL(cpu_sibling_map);
75
76/* representing HT and core siblings of each logical CPU */
77cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
78EXPORT_SYMBOL(cpu_core_map);
79
80/* bitmap of online cpus */
81cpumask_t cpu_online_map __read_mostly;
82EXPORT_SYMBOL(cpu_online_map);
83
84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask;
90
91/* Per CPU bogomips and other parameters */
92struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
93EXPORT_SYMBOL(cpu_data);
94
95u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
96 { [0 ... NR_CPUS-1] = 0xff };
97EXPORT_SYMBOL(x86_cpu_to_apicid);
98
99u8 apicid_2_node[MAX_APICID];
100
101/*
102 * Trampoline 80x86 program as an array.
103 */
104
105extern unsigned char trampoline_data [];
106extern unsigned char trampoline_end [];
107static unsigned char *trampoline_base;
108static int trampoline_exec;
109
110static void map_cpu_to_logical_apicid(void);
111
112/* State of each CPU. */
113DEFINE_PER_CPU(int, cpu_state) = { 0 };
114
115/*
116 * Currently trivial. Write the real->protected mode
117 * bootstrap into the page concerned. The caller
118 * has made sure it's suitably aligned.
119 */
120
121static unsigned long __devinit setup_trampoline(void)
122{
123 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
124 return virt_to_phys(trampoline_base);
125}
126
127/*
128 * We are called very early to get the low memory for the
129 * SMP bootup trampoline page.
130 */
131void __init smp_alloc_memory(void)
132{
133 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
134 /*
135 * Has to be in very low memory so we can execute
136 * real-mode AP code.
137 */
138 if (__pa(trampoline_base) >= 0x9F000)
139 BUG();
140 /*
141 * Make the SMP trampoline executable:
142 */
143 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
144}
145
146/*
147 * The bootstrap kernel entry code has set these up. Save them for
148 * a given CPU
149 */
150
151void __cpuinit smp_store_cpu_info(int id)
152{
153 struct cpuinfo_x86 *c = cpu_data + id;
154
155 *c = boot_cpu_data;
156 if (id != 0)
157 identify_secondary_cpu(c);
158 /*
159 * Mask B, Pentium, but not Pentium MMX
160 */
161 if (c->x86_vendor == X86_VENDOR_INTEL &&
162 c->x86 == 5 &&
163 c->x86_mask >= 1 && c->x86_mask <= 4 &&
164 c->x86_model <= 3)
165 /*
166 * Remember we have B step Pentia with bugs
167 */
168 smp_b_stepping = 1;
169
170 /*
171 * Certain Athlons might work (for various values of 'work') in SMP
172 * but they are not certified as MP capable.
173 */
174 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
175
176 if (num_possible_cpus() == 1)
177 goto valid_k7;
178
179 /* Athlon 660/661 is valid. */
180 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
181 goto valid_k7;
182
183 /* Duron 670 is valid */
184 if ((c->x86_model==7) && (c->x86_mask==0))
185 goto valid_k7;
186
187 /*
188 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
189 * It's worth noting that the A5 stepping (662) of some Athlon XP's
190 * have the MP bit set.
191 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
192 */
193 if (((c->x86_model==6) && (c->x86_mask>=2)) ||
194 ((c->x86_model==7) && (c->x86_mask>=1)) ||
195 (c->x86_model> 7))
196 if (cpu_has_mp)
197 goto valid_k7;
198
199 /* If we get here, it's not a certified SMP capable AMD system. */
200 add_taint(TAINT_UNSAFE_SMP);
201 }
202
203valid_k7:
204 ;
205}
206
207extern void calibrate_delay(void);
208
209static atomic_t init_deasserted;
210
211static void __cpuinit smp_callin(void)
212{
213 int cpuid, phys_id;
214 unsigned long timeout;
215
216 /*
217 * If woken up by an INIT in an 82489DX configuration
218 * we may get here before an INIT-deassert IPI reaches
219 * our local APIC. We have to wait for the IPI or we'll
220 * lock up on an APIC access.
221 */
222 wait_for_init_deassert(&init_deasserted);
223
224 /*
225 * (This works even if the APIC is not enabled.)
226 */
227 phys_id = GET_APIC_ID(apic_read(APIC_ID));
228 cpuid = smp_processor_id();
229 if (cpu_isset(cpuid, cpu_callin_map)) {
230 printk("huh, phys CPU#%d, CPU#%d already present??\n",
231 phys_id, cpuid);
232 BUG();
233 }
234 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
235
236 /*
237 * STARTUP IPIs are fragile beasts as they might sometimes
238 * trigger some glue motherboard logic. Complete APIC bus
239 * silence for 1 second, this overestimates the time the
240 * boot CPU is spending to send the up to 2 STARTUP IPIs
241 * by a factor of two. This should be enough.
242 */
243
244 /*
245 * Waiting 2s total for startup (udelay is not yet working)
246 */
247 timeout = jiffies + 2*HZ;
248 while (time_before(jiffies, timeout)) {
249 /*
250 * Has the boot CPU finished its STARTUP sequence?
251 */
252 if (cpu_isset(cpuid, cpu_callout_map))
253 break;
254 rep_nop();
255 }
256
257 if (!time_before(jiffies, timeout)) {
258 printk("BUG: CPU%d started up but did not get a callout!\n",
259 cpuid);
260 BUG();
261 }
262
263 /*
264 * the boot CPU has finished the init stage and is spinning
265 * on callin_map until we finish. We are free to set up this
266 * CPU, first the APIC. (this is probably redundant on most
267 * boards)
268 */
269
270 Dprintk("CALLIN, before setup_local_APIC().\n");
271 smp_callin_clear_local_apic();
272 setup_local_APIC();
273 map_cpu_to_logical_apicid();
274
275 /*
276 * Get our bogomips.
277 */
278 calibrate_delay();
279 Dprintk("Stack at about %p\n",&cpuid);
280
281 /*
282 * Save our processor parameters
283 */
284 smp_store_cpu_info(cpuid);
285
286 /*
287 * Allow the master to continue.
288 */
289 cpu_set(cpuid, cpu_callin_map);
290}
291
292static int cpucount;
293
294/* maps the cpu to the sched domain representing multi-core */
295cpumask_t cpu_coregroup_map(int cpu)
296{
297 struct cpuinfo_x86 *c = cpu_data + cpu;
298 /*
299 * For perf, we return last level cache shared map.
300 * And for power savings, we return cpu_core_map
301 */
302 if (sched_mc_power_savings || sched_smt_power_savings)
303 return cpu_core_map[cpu];
304 else
305 return c->llc_shared_map;
306}
307
308/* representing cpus for which sibling maps can be computed */
309static cpumask_t cpu_sibling_setup_map;
310
311void __cpuinit set_cpu_sibling_map(int cpu)
312{
313 int i;
314 struct cpuinfo_x86 *c = cpu_data;
315
316 cpu_set(cpu, cpu_sibling_setup_map);
317
318 if (smp_num_siblings > 1) {
319 for_each_cpu_mask(i, cpu_sibling_setup_map) {
320 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
321 c[cpu].cpu_core_id == c[i].cpu_core_id) {
322 cpu_set(i, cpu_sibling_map[cpu]);
323 cpu_set(cpu, cpu_sibling_map[i]);
324 cpu_set(i, cpu_core_map[cpu]);
325 cpu_set(cpu, cpu_core_map[i]);
326 cpu_set(i, c[cpu].llc_shared_map);
327 cpu_set(cpu, c[i].llc_shared_map);
328 }
329 }
330 } else {
331 cpu_set(cpu, cpu_sibling_map[cpu]);
332 }
333
334 cpu_set(cpu, c[cpu].llc_shared_map);
335
336 if (current_cpu_data.x86_max_cores == 1) {
337 cpu_core_map[cpu] = cpu_sibling_map[cpu];
338 c[cpu].booted_cores = 1;
339 return;
340 }
341
342 for_each_cpu_mask(i, cpu_sibling_setup_map) {
343 if (cpu_llc_id[cpu] != BAD_APICID &&
344 cpu_llc_id[cpu] == cpu_llc_id[i]) {
345 cpu_set(i, c[cpu].llc_shared_map);
346 cpu_set(cpu, c[i].llc_shared_map);
347 }
348 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
349 cpu_set(i, cpu_core_map[cpu]);
350 cpu_set(cpu, cpu_core_map[i]);
351 /*
352 * Does this new cpu bring up a new core?
353 */
354 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
355 /*
356 * for each core in package, increment
357 * the booted_cores for this new cpu
358 */
359 if (first_cpu(cpu_sibling_map[i]) == i)
360 c[cpu].booted_cores++;
361 /*
362 * increment the core count for all
363 * the other cpus in this package
364 */
365 if (i != cpu)
366 c[i].booted_cores++;
367 } else if (i != cpu && !c[cpu].booted_cores)
368 c[cpu].booted_cores = c[i].booted_cores;
369 }
370 }
371}
372
373/*
374 * Activate a secondary processor.
375 */
376static void __cpuinit start_secondary(void *unused)
377{
378 /*
379 * Don't put *anything* before cpu_init(); SMP booting is
380 * fragile enough that we want to limit the things done here
381 * to the bare minimum.
382 */
383#ifdef CONFIG_VMI
384 vmi_bringup();
385#endif
386 cpu_init();
387 preempt_disable();
388 smp_callin();
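 /*
 * Wait here until the boot CPU releases us by setting our bit in
 * smp_commenced_mask (done from native_cpu_up()).
 */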
389 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
390 rep_nop();
391 /*
392 * Check TSC synchronization with the BP:
393 */
394 check_tsc_sync_target();
395
396 setup_secondary_clock();
397 if (nmi_watchdog == NMI_IO_APIC) {
398 disable_8259A_irq(0);
399 enable_NMI_through_LVT0(NULL);
400 enable_8259A_irq(0);
401 }
402 /*
403 * low-memory mappings have been cleared, flush them from
404 * the local TLBs too.
405 */
406 local_flush_tlb();
407
408 /* This must be done before setting cpu_online_map */
409 set_cpu_sibling_map(raw_smp_processor_id());
410 wmb();
411
412 /*
413 * We need to hold call_lock, so there is no inconsistency
414 * between the time smp_call_function() determines the number of
415 * IPI recipients, and the time when the determination is made
416 * for which cpus receive the IPI. Holding this
417 * lock helps us to not include this cpu in a currently in progress
418 * smp_call_function().
419 */
420 lock_ipi_call_lock();
421 cpu_set(smp_processor_id(), cpu_online_map);
422 unlock_ipi_call_lock();
423 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
424
425 /* We can take interrupts now: we're officially "up". */
426 local_irq_enable();
427
428 wmb();
429 cpu_idle();
430}
431
432/*
433 * Everything has been set up for the secondary
434 * CPUs - they just need to reload everything
435 * from the task structure
436 * This function must not return.
437 */
438void __devinit initialize_secondary(void)
439{
440 /*
441 * We don't actually need to load the full TSS,
442 * basically just the stack pointer and the eip.
443 */
444
445 asm volatile(
446 "movl %0,%%esp\n\t"
447 "jmp *%1"
448 :
449 :"m" (current->thread.esp),"m" (current->thread.eip));
450}
451
452/* Static state in head.S used to set up a CPU */
453extern struct {
454 void * esp;
455 unsigned short ss;
456} stack_start;
457
458#ifdef CONFIG_NUMA
459
460/* which logical CPUs are on which nodes */
461cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
462 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
463EXPORT_SYMBOL(node_2_cpu_mask);
464/* which node each logical CPU is on */
465int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
466EXPORT_SYMBOL(cpu_2_node);
467
468/* set up a mapping between cpu and node. */
469static inline void map_cpu_to_node(int cpu, int node)
470{
471 printk("Mapping cpu %d to node %d\n", cpu, node);
472 cpu_set(cpu, node_2_cpu_mask[node]);
473 cpu_2_node[cpu] = node;
474}
475
476/* undo a mapping between cpu and node. */
477static inline void unmap_cpu_to_node(int cpu)
478{
479 int node;
480
481 printk("Unmapping cpu %d from all nodes\n", cpu);
482 for (node = 0; node < MAX_NUMNODES; node ++)
483 cpu_clear(cpu, node_2_cpu_mask[node]);
484 cpu_2_node[cpu] = 0;
485}
486#else /* !CONFIG_NUMA */
487
488#define map_cpu_to_node(cpu, node) ({})
489#define unmap_cpu_to_node(cpu) ({})
490
491#endif /* CONFIG_NUMA */
492
493u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
494
495static void map_cpu_to_logical_apicid(void)
496{
497 int cpu = smp_processor_id();
498 int apicid = logical_smp_processor_id();
499 int node = apicid_to_node(apicid);
500
501 if (!node_online(node))
502 node = first_online_node;
503
504 cpu_2_logical_apicid[cpu] = apicid;
505 map_cpu_to_node(cpu, node);
506}
507
508static void unmap_cpu_to_logical_apicid(int cpu)
509{
510 cpu_2_logical_apicid[cpu] = BAD_APICID;
511 unmap_cpu_to_node(cpu);
512}
513
514static inline void __inquire_remote_apic(int apicid)
515{
516 int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
517 char *names[] = { "ID", "VERSION", "SPIV" };
518 int timeout;
519 unsigned long status;
520
521 printk("Inquiring remote APIC #%d...\n", apicid);
522
523 for (i = 0; i < ARRAY_SIZE(regs); i++) {
524 printk("... APIC #%d %s: ", apicid, names[i]);
525
526 /*
527 * Wait for idle.
528 */
529 status = safe_apic_wait_icr_idle();
530 if (status)
531 printk("a previous APIC delivery may have failed\n");
532
533 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
534 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
535
536 timeout = 0;
537 do {
538 udelay(100);
539 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
540 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
541
542 switch (status) {
543 case APIC_ICR_RR_VALID:
544 status = apic_read(APIC_RRR);
545 printk("%lx\n", status);
546 break;
547 default:
548 printk("failed\n");
549 }
550 }
551}
552
553#ifdef WAKE_SECONDARY_VIA_NMI
554/*
555 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
556 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
557 * won't ... remember to clear down the APIC, etc later.
558 */
559static int __devinit
560wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
561{
562 unsigned long send_status, accept_status = 0;
563 int maxlvt;
564
565 /* Target chip */
566 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
567
568 /* Boot on the stack */
569 /* Kick the second */
570 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
571
572 Dprintk("Waiting for send to finish...\n");
573 send_status = safe_apic_wait_icr_idle();
574
575 /*
576 * Give the other CPU some time to accept the IPI.
577 */
578 udelay(200);
579 /*
580 * Due to the Pentium erratum 3AP.
581 */
582 maxlvt = lapic_get_maxlvt();
583 if (maxlvt > 3) {
584 apic_read_around(APIC_SPIV);
585 apic_write(APIC_ESR, 0);
586 }
587 accept_status = (apic_read(APIC_ESR) & 0xEF);
588 Dprintk("NMI sent.\n");
589
590 if (send_status)
591 printk("APIC never delivered???\n");
592 if (accept_status)
593 printk("APIC delivery error (%lx).\n", accept_status);
594
595 return (send_status | accept_status);
596}
597#endif /* WAKE_SECONDARY_VIA_NMI */
598
599#ifdef WAKE_SECONDARY_VIA_INIT
600static int __devinit
601wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
602{
603 unsigned long send_status, accept_status = 0;
604 int maxlvt, num_starts, j;
605
606 /*
607 * Be paranoid about clearing APIC errors.
608 */
609 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
610 apic_read_around(APIC_SPIV);
611 apic_write(APIC_ESR, 0);
612 apic_read(APIC_ESR);
613 }
614
615 Dprintk("Asserting INIT.\n");
616
617 /*
618 * Turn INIT on target chip
619 */
620 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
621
622 /*
623 * Send IPI
624 */
625 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
626 | APIC_DM_INIT);
627
628 Dprintk("Waiting for send to finish...\n");
629 send_status = safe_apic_wait_icr_idle();
630
631 mdelay(10);
632
633 Dprintk("Deasserting INIT.\n");
634
635 /* Target chip */
636 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
637
638 /* Send IPI */
639 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
640
641 Dprintk("Waiting for send to finish...\n");
642 send_status = safe_apic_wait_icr_idle();
643
644 atomic_set(&init_deasserted, 1);
645
646 /*
647 * Should we send STARTUP IPIs ?
648 *
649 * Determine this based on the APIC version.
650 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
651 */
652 if (APIC_INTEGRATED(apic_version[phys_apicid]))
653 num_starts = 2;
654 else
655 num_starts = 0;
656
657 /*
658 * Paravirt / VMI wants a startup IPI hook here to set up the
659 * target processor state.
660 */
661 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
662 (unsigned long) stack_start.esp);
663
664 /*
665 * Run STARTUP IPI loop.
666 */
667 Dprintk("#startup loops: %d.\n", num_starts);
668
669 maxlvt = lapic_get_maxlvt();
670
671 for (j = 1; j <= num_starts; j++) {
672 Dprintk("Sending STARTUP #%d.\n",j);
673 apic_read_around(APIC_SPIV);
674 apic_write(APIC_ESR, 0);
675 apic_read(APIC_ESR);
676 Dprintk("After apic_write.\n");
677
678 /*
679 * STARTUP IPI
680 */
681
682 /* Target chip */
683 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
684
685 /* Boot on the stack */
686 /* Kick the second */
687 apic_write_around(APIC_ICR, APIC_DM_STARTUP
688 | (start_eip >> 12));
689
690 /*
691 * Give the other CPU some time to accept the IPI.
692 */
693 udelay(300);
694
695 Dprintk("Startup point 1.\n");
696
697 Dprintk("Waiting for send to finish...\n");
698 send_status = safe_apic_wait_icr_idle();
699
700 /*
701 * Give the other CPU some time to accept the IPI.
702 */
703 udelay(200);
704 /*
705 * Due to the Pentium erratum 3AP.
706 */
707 if (maxlvt > 3) {
708 apic_read_around(APIC_SPIV);
709 apic_write(APIC_ESR, 0);
710 }
711 accept_status = (apic_read(APIC_ESR) & 0xEF);
712 if (send_status || accept_status)
713 break;
714 }
715 Dprintk("After Startup.\n");
716
717 if (send_status)
718 printk("APIC never delivered???\n");
719 if (accept_status)
720 printk("APIC delivery error (%lx).\n", accept_status);
721
722 return (send_status | accept_status);
723}
724#endif /* WAKE_SECONDARY_VIA_INIT */
725
726extern cpumask_t cpu_initialized;
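/*
 * Pick the lowest CPU number that is not yet in cpu_present_map, or
 * return -ENODEV if every slot up to NR_CPUS is already taken.
 */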
727static inline int alloc_cpu_id(void)
728{
729 cpumask_t tmp_map;
730 int cpu;
731 cpus_complement(tmp_map, cpu_present_map);
732 cpu = first_cpu(tmp_map);
733 if (cpu >= NR_CPUS)
734 return -ENODEV;
735 return cpu;
736}
737
738#ifdef CONFIG_HOTPLUG_CPU
739static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
740static inline struct task_struct * alloc_idle_task(int cpu)
741{
742 struct task_struct *idle;
743
744 if ((idle = cpu_idle_tasks[cpu]) != NULL) {
745 /* initialize thread_struct. we really want to avoid destroying
746 * the idle thread
747 */
748 idle->thread.esp = (unsigned long)task_pt_regs(idle);
749 init_idle(idle, cpu);
750 return idle;
751 }
752 idle = fork_idle(cpu);
753
754 if (!IS_ERR(idle))
755 cpu_idle_tasks[cpu] = idle;
756 return idle;
757}
758#else
759#define alloc_idle_task(cpu) fork_idle(cpu)
760#endif
761
762static int __cpuinit do_boot_cpu(int apicid, int cpu)
763/*
764 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
765 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
766 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
767 */
768{
769 struct task_struct *idle;
770 unsigned long boot_error;
771 int timeout;
772 unsigned long start_eip;
773 unsigned short nmi_high = 0, nmi_low = 0;
774
775 /*
776 * Save current MTRR state in case it was changed since early boot
777 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
778 */
779 mtrr_save_state();
780
781 /*
782 * We can't use kernel_thread since we must avoid
783 * rescheduling the child.
784 */
785 idle = alloc_idle_task(cpu);
786 if (IS_ERR(idle))
787 panic("failed fork for CPU %d", cpu);
788
789 init_gdt(cpu);
790 per_cpu(current_task, cpu) = idle;
791 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
792
793 idle->thread.eip = (unsigned long) start_secondary;
794 /* start_eip had better be page-aligned! */
795 start_eip = setup_trampoline();
796
797 ++cpucount;
798 alternatives_smp_switch(1);
799
800 /* So we see what's up */
801 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
802 /* Stack for startup_32 can be just as for start_secondary onwards */
803 stack_start.esp = (void *) idle->thread.esp;
804
805 irq_ctx_init(cpu);
806
807 x86_cpu_to_apicid[cpu] = apicid;
808 /*
809 * This grunge runs the startup process for
810 * the targeted processor.
811 */
812
813 atomic_set(&init_deasserted, 0);
814
815 Dprintk("Setting warm reset code and vector.\n");
816
817 store_NMI_vector(&nmi_high, &nmi_low);
818
819 smpboot_setup_warm_reset_vector(start_eip);
820
821 /*
822 * Starting actual IPI sequence...
823 */
824 boot_error = wakeup_secondary_cpu(apicid, start_eip);
825
826 if (!boot_error) {
827 /*
828 * allow APs to start initializing.
829 */
830 Dprintk("Before Callout %d.\n", cpu);
831 cpu_set(cpu, cpu_callout_map);
832 Dprintk("After Callout %d.\n", cpu);
833
834 /*
835 * Wait 5s total for a response
836 */
837 for (timeout = 0; timeout < 50000; timeout++) {
838 if (cpu_isset(cpu, cpu_callin_map))
839 break; /* It has booted */
840 udelay(100);
841 }
842
843 if (cpu_isset(cpu, cpu_callin_map)) {
844 /* number CPUs logically, starting from 1 (BSP is 0) */
845 Dprintk("OK.\n");
846 printk("CPU%d: ", cpu);
847 print_cpu_info(&cpu_data[cpu]);
848 Dprintk("CPU has booted.\n");
849 } else {
850 boot_error = 1;
851 if (*((volatile unsigned char *)trampoline_base)
852 == 0xA5)
853 /* trampoline started but...? */
854 printk("Stuck ??\n");
855 else
856 /* trampoline code not run */
857 printk("Not responding.\n");
858 inquire_remote_apic(apicid);
859 }
860 }
861
862 if (boot_error) {
863 /* Try to put things back the way they were before ... */
864 unmap_cpu_to_logical_apicid(cpu);
865 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
866 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
867 cpucount--;
868 } else {
869 x86_cpu_to_apicid[cpu] = apicid;
870 cpu_set(cpu, cpu_present_map);
871 }
872
873 /* mark "stuck" area as not stuck */
874 *((volatile unsigned long *)trampoline_base) = 0;
875
876 return boot_error;
877}
878
879#ifdef CONFIG_HOTPLUG_CPU
880void cpu_exit_clear(void)
881{
882 int cpu = raw_smp_processor_id();
883
884 idle_task_exit();
885
886 cpucount --;
887 cpu_uninit();
888 irq_ctx_exit(cpu);
889
890 cpu_clear(cpu, cpu_callout_map);
891 cpu_clear(cpu, cpu_callin_map);
892
893 cpu_clear(cpu, smp_commenced_mask);
894 unmap_cpu_to_logical_apicid(cpu);
895}
896
897struct warm_boot_cpu_info {
898 struct completion *complete;
899 struct work_struct task;
900 int apicid;
901 int cpu;
902};
903
904static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
905{
906 struct warm_boot_cpu_info *info =
907 container_of(work, struct warm_boot_cpu_info, task);
908 do_boot_cpu(info->apicid, info->cpu);
909 complete(info->complete);
910}
911
912static int __cpuinit __smp_prepare_cpu(int cpu)
913{
914 DECLARE_COMPLETION_ONSTACK(done);
915 struct warm_boot_cpu_info info;
916 int apicid, ret;
917
918 apicid = x86_cpu_to_apicid[cpu];
919 if (apicid == BAD_APICID) {
920 ret = -ENODEV;
921 goto exit;
922 }
923
924 info.complete = &done;
925 info.apicid = apicid;
926 info.cpu = cpu;
927 INIT_WORK(&info.task, do_warm_boot_cpu);
928
929 /* init low mem mapping */
930 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
931 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
932 flush_tlb_all();
933 schedule_work(&info.task);
934 wait_for_completion(&done);
935
936 zap_low_mappings();
937 ret = 0;
938exit:
939 return ret;
940}
941#endif
942
943/*
944 * Cycle through the processors sending APIC IPIs to boot each.
945 */
946
947static int boot_cpu_logical_apicid;
948/* Where the IO area was mapped on multiquad, always 0 otherwise */
949void *xquad_portio;
950#ifdef CONFIG_X86_NUMAQ
951EXPORT_SYMBOL(xquad_portio);
952#endif
953
954static void __init smp_boot_cpus(unsigned int max_cpus)
955{
956 int apicid, cpu, bit, kicked;
957 unsigned long bogosum = 0;
958
959 /*
960 * Setup boot CPU information
961 */
962 smp_store_cpu_info(0); /* Final full version of the data */
963 printk("CPU%d: ", 0);
964 print_cpu_info(&cpu_data[0]);
965
966 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
967 boot_cpu_logical_apicid = logical_smp_processor_id();
968 x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
969
970 current_thread_info()->cpu = 0;
971
972 set_cpu_sibling_map(0);
973
974 /*
975 * If we couldn't find an SMP configuration at boot time,
976 * get out of here now!
977 */
978 if (!smp_found_config && !acpi_lapic) {
979 printk(KERN_NOTICE "SMP motherboard not detected.\n");
980 smpboot_clear_io_apic_irqs();
981 phys_cpu_present_map = physid_mask_of_physid(0);
982 if (APIC_init_uniprocessor())
983 printk(KERN_NOTICE "Local APIC not detected."
984 " Using dummy APIC emulation.\n");
985 map_cpu_to_logical_apicid();
986 cpu_set(0, cpu_sibling_map[0]);
987 cpu_set(0, cpu_core_map[0]);
988 return;
989 }
990
991 /*
992 * Should not be necessary because the MP table should list the boot
993 * CPU too, but we do it for the sake of robustness anyway.
994 * Makes no sense to do this check in clustered apic mode, so skip it
995 */
996 if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
997 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
998 boot_cpu_physical_apicid);
999 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1000 }
1001
1002 /*
1003 * If we couldn't find a local APIC, then get out of here now!
1004 */
1005 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
1006 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1007 boot_cpu_physical_apicid);
1008 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
1009 smpboot_clear_io_apic_irqs();
1010 phys_cpu_present_map = physid_mask_of_physid(0);
1011 cpu_set(0, cpu_sibling_map[0]);
1012 cpu_set(0, cpu_core_map[0]);
1013 return;
1014 }
1015
1016 verify_local_APIC();
1017
1018 /*
1019 * If SMP should be disabled, then really disable it!
1020 */
1021 if (!max_cpus) {
1022 smp_found_config = 0;
1023 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
1024 smpboot_clear_io_apic_irqs();
1025 phys_cpu_present_map = physid_mask_of_physid(0);
1026 cpu_set(0, cpu_sibling_map[0]);
1027 cpu_set(0, cpu_core_map[0]);
1028 return;
1029 }
1030
1031 connect_bsp_APIC();
1032 setup_local_APIC();
1033 map_cpu_to_logical_apicid();
1034
1035
1036 setup_portio_remap();
1037
1038 /*
1039 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
1040 *
1041 * In clustered apic mode, phys_cpu_present_map is constructed thus:
1042 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
1043 * clustered apic ID.
1044 */
1045 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
1046
1047 kicked = 1;
1048 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
1049 apicid = cpu_present_to_apicid(bit);
1050 /*
1051 * Don't even attempt to start the boot CPU!
1052 */
1053 if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
1054 continue;
1055
1056 if (!check_apicid_present(bit))
1057 continue;
1058 if (max_cpus <= cpucount+1)
1059 continue;
1060
1061 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
1062 printk("CPU #%d not responding - cannot use it.\n",
1063 apicid);
1064 else
1065 ++kicked;
1066 }
1067
1068 /*
1069 * Cleanup possible dangling ends...
1070 */
1071 smpboot_restore_warm_reset_vector();
1072
1073 /*
1074 * Allow the user to impress friends.
1075 */
1076 Dprintk("Before bogomips.\n");
1077 for (cpu = 0; cpu < NR_CPUS; cpu++)
1078 if (cpu_isset(cpu, cpu_callout_map))
1079 bogosum += cpu_data[cpu].loops_per_jiffy;
1080 printk(KERN_INFO
1081 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
1082 cpucount+1,
1083 bogosum/(500000/HZ),
1084 (bogosum/(5000/HZ))%100);
1085
1086 Dprintk("Before bogocount - setting activated=1.\n");
1087
1088 if (smp_b_stepping)
1089 printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
1090
1091 /*
1092 * Don't taint if we are running SMP kernel on a single non-MP
1093 * approved Athlon
1094 */
1095 if (tainted & TAINT_UNSAFE_SMP) {
1096 if (cpucount)
1097 printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
1098 else
1099 tainted &= ~TAINT_UNSAFE_SMP;
1100 }
1101
1102 Dprintk("Boot done.\n");
1103
1104 /*
1105 * construct cpu_sibling_map[], so that we can tell sibling CPUs
1106 * efficiently.
1107 */
1108 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1109 cpus_clear(cpu_sibling_map[cpu]);
1110 cpus_clear(cpu_core_map[cpu]);
1111 }
1112
1113 cpu_set(0, cpu_sibling_map[0]);
1114 cpu_set(0, cpu_core_map[0]);
1115
1116 smpboot_setup_io_apic();
1117
1118 setup_boot_clock();
1119}
1120
1121/* These are wrappers to interface to the new boot process. Someone
1122 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
1123void __init native_smp_prepare_cpus(unsigned int max_cpus)
1124{
1125 smp_commenced_mask = cpumask_of_cpu(0);
1126 cpu_callin_map = cpumask_of_cpu(0);
1127 mb();
1128 smp_boot_cpus(max_cpus);
1129}
1130
1131void __init native_smp_prepare_boot_cpu(void)
1132{
1133 unsigned int cpu = smp_processor_id();
1134
1135 init_gdt(cpu);
1136 switch_to_new_gdt();
1137
1138 cpu_set(cpu, cpu_online_map);
1139 cpu_set(cpu, cpu_callout_map);
1140 cpu_set(cpu, cpu_present_map);
1141 cpu_set(cpu, cpu_possible_map);
1142 __get_cpu_var(cpu_state) = CPU_ONLINE;
1143}
1144
1145#ifdef CONFIG_HOTPLUG_CPU
1146void remove_siblinginfo(int cpu)
1147{
1148 int sibling;
1149 struct cpuinfo_x86 *c = cpu_data;
1150
1151 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
1152 cpu_clear(cpu, cpu_core_map[sibling]);
1153 /*
1154 * last thread sibling in this cpu core going down
1155 */
1156 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
1157 c[sibling].booted_cores--;
1158 }
1159
1160 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1161 cpu_clear(cpu, cpu_sibling_map[sibling]);
1162 cpus_clear(cpu_sibling_map[cpu]);
1163 cpus_clear(cpu_core_map[cpu]);
1164 c[cpu].phys_proc_id = 0;
1165 c[cpu].cpu_core_id = 0;
1166 cpu_clear(cpu, cpu_sibling_setup_map);
1167}
1168
1169int __cpu_disable(void)
1170{
1171 cpumask_t map = cpu_online_map;
1172 int cpu = smp_processor_id();
1173
1174 /*
1175 * Perhaps use cpufreq to drop frequency, but that could go
1176 * into generic code.
1177 *
1178 * We won't take down the boot processor on i386 due to some
1179 * interrupts only being able to be serviced by the BSP.
1180 * Especially so if we're not using an IOAPIC -zwane
1181 */
1182 if (cpu == 0)
1183 return -EBUSY;
1184 if (nmi_watchdog == NMI_LOCAL_APIC)
1185 stop_apic_nmi_watchdog(NULL);
1186 clear_local_APIC();
1187 /* Allow any queued timer interrupts to get serviced */
1188 local_irq_enable();
1189 mdelay(1);
1190 local_irq_disable();
1191
1192 remove_siblinginfo(cpu);
1193
1194 cpu_clear(cpu, map);
1195 fixup_irqs(map);
1196 /* It's now safe to remove this processor from the online map */
1197 cpu_clear(cpu, cpu_online_map);
1198 return 0;
1199}
1200
1201void __cpu_die(unsigned int cpu)
1202{
1203 /* We don't do anything here: idle task is faking death itself. */
1204 unsigned int i;
1205
1206 for (i = 0; i < 10; i++) {
1207 /* They ack this in play_dead by setting CPU_DEAD */
1208 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1209 printk ("CPU %d is now offline\n", cpu);
1210 if (1 == num_online_cpus())
1211 alternatives_smp_switch(0);
1212 return;
1213 }
1214 msleep(100);
1215 }
1216 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1217}
1218#else /* ... !CONFIG_HOTPLUG_CPU */
1219int __cpu_disable(void)
1220{
1221 return -ENOSYS;
1222}
1223
1224void __cpu_die(unsigned int cpu)
1225{
1226 /* We said "no" in __cpu_disable */
1227 BUG();
1228}
1229#endif /* CONFIG_HOTPLUG_CPU */
1230
1231int __cpuinit native_cpu_up(unsigned int cpu)
1232{
1233 unsigned long flags;
1234#ifdef CONFIG_HOTPLUG_CPU
1235 int ret = 0;
1236
1237 /*
1238 * We do warm boot only on cpus that had booted earlier.
1239 * Otherwise cold boot is all handled from smp_boot_cpus().
1240 * cpu_callin_map is set during the AP kickstart process. It's reset
1241 * when a cpu is taken offline from cpu_exit_clear().
1242 */
1243 if (!cpu_isset(cpu, cpu_callin_map))
1244 ret = __smp_prepare_cpu(cpu);
1245
1246 if (ret)
1247 return -EIO;
1248#endif
1249
1250 /* In case one didn't come up */
1251 if (!cpu_isset(cpu, cpu_callin_map)) {
1252 printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
1253 return -EIO;
1254 }
1255
1256 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1257 /* Unleash the CPU! */
1258 cpu_set(cpu, smp_commenced_mask);
1259
1260 /*
1261 * Check TSC synchronization with the AP (keep irqs disabled
1262 * while doing so):
1263 */
1264 local_irq_save(flags);
1265 check_tsc_sync_source(cpu);
1266 local_irq_restore(flags);
1267
1268 while (!cpu_isset(cpu, cpu_online_map)) {
1269 cpu_relax();
1270 touch_nmi_watchdog();
1271 }
1272
1273 return 0;
1274}
1275
1276void __init native_smp_cpus_done(unsigned int max_cpus)
1277{
1278#ifdef CONFIG_X86_IO_APIC
1279 setup_ioapic_dest();
1280#endif
1281 zap_low_mappings();
1282#ifndef CONFIG_HOTPLUG_CPU
1283 /*
1284 * Disable executability of the SMP trampoline:
1285 */
1286 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1287#endif
1288}
1289
1290void __init smp_intr_init(void)
1291{
1292 /*
1293 * IRQ0 must be given a fixed assignment and initialized,
1294 * because it's used before the IO-APIC is set up.
1295 */
1296 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1297
1298 /*
1299 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1300 * IPI, driven by wakeup.
1301 */
1302 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1303
1304 /* IPI for invalidation */
1305 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1306
1307 /* IPI for generic function call */
1308 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1309}
1310
1311/*
1312 * If the BIOS enumerates physical processors before logical,
1313 * maxcpus=N at enumeration-time can be used to disable HT.
1314 */
1315static int __init parse_maxcpus(char *arg)
1316{
1317 extern unsigned int maxcpus;
1318
1319 maxcpus = simple_strtoul(arg, NULL, 0);
1320 return 0;
1321}
1322early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
new file mode 100644
index 000000000000..32f50783edc8
--- /dev/null
+++ b/arch/x86/kernel/smpboot_64.c
@@ -0,0 +1,1085 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2
16 *
17 * Fixes
18 * Felix Koop : NR_CPUS used properly
19 * Jose Renau : Handle single CPU case.
20 * Alan Cox : By repeated request 8) - Total BogoMIP report.
21 * Greg Wright : Fix for kernel stacks panic.
22 * Erich Boleyn : MP v1.4 and additional changes.
23 * Matthias Sattler : Changes for 2.1 kernel map.
24 * Michel Lespinasse : Changes for 2.1 kernel map.
25 * Michael Chastain : Change trampoline.S to gnu as.
26 * Alan Cox : Dumb bug: 'B' step PPro's are fine
27 * Ingo Molnar : Added APIC timers, based on code
28 * from Jose Renau
29 * Ingo Molnar : various cleanups and rewrites
30 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
31 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
32 * Andi Kleen : Changed for SMP boot into long mode.
33 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
38 */
39
40
41#include <linux/init.h>
42
43#include <linux/mm.h>
44#include <linux/kernel_stat.h>
45#include <linux/bootmem.h>
46#include <linux/thread_info.h>
47#include <linux/module.h>
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <linux/smp.h>
51#include <linux/kdebug.h>
52
53#include <asm/mtrr.h>
54#include <asm/pgalloc.h>
55#include <asm/desc.h>
56#include <asm/tlbflush.h>
57#include <asm/proto.h>
58#include <asm/nmi.h>
59#include <asm/irq.h>
60#include <asm/hw_irq.h>
61#include <asm/numa.h>
62
63/* Number of siblings per CPU package */
64int smp_num_siblings = 1;
65EXPORT_SYMBOL(smp_num_siblings);
66
67/* Last level cache ID of each logical CPU */
68u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
69
70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map __read_mostly;
72
73EXPORT_SYMBOL(cpu_online_map);
74
75/*
76 * Private maps to synchronize booting between AP and BP.
77 * Probably not needed anymore, but it makes for easier debugging. -AK
78 */
79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map;
81EXPORT_SYMBOL(cpu_callout_map);
82
83cpumask_t cpu_possible_map;
84EXPORT_SYMBOL(cpu_possible_map);
85
86/* Per CPU bogomips and other parameters */
87struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
88EXPORT_SYMBOL(cpu_data);
89
90/* Set when the idlers are all forked */
91int smp_threads_ready;
92
93/* representing HT siblings of each logical CPU */
94cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
95EXPORT_SYMBOL(cpu_sibling_map);
96
97/* representing HT and core siblings of each logical CPU */
98cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
99EXPORT_SYMBOL(cpu_core_map);
100
101/*
102 * Trampoline 80x86 program as an array.
103 */
104
105extern unsigned char trampoline_data[];
106extern unsigned char trampoline_end[];
107
108/* State of each CPU */
109DEFINE_PER_CPU(int, cpu_state) = { 0 };
110
111/*
112 * Store all idle threads; these can be reused instead of creating
113 * new threads. This also avoids complicated thread-destroy functionality
114 * for idle threads.
115 */
116struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
117
118#define get_idle_for_cpu(x) (idle_thread_array[(x)])
119#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
120
121/*
122 * Currently trivial. Write the real->protected mode
123 * bootstrap into the page concerned. The caller
124 * has made sure it's suitably aligned.
125 */
126
127static unsigned long __cpuinit setup_trampoline(void)
128{
129 void *tramp = __va(SMP_TRAMPOLINE_BASE);
130 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
131 return virt_to_phys(tramp);
132}
133
134/*
135 * The bootstrap kernel entry code has set these up. Save them for
136 * a given CPU
137 */
138
139static void __cpuinit smp_store_cpu_info(int id)
140{
141 struct cpuinfo_x86 *c = cpu_data + id;
142
143 *c = boot_cpu_data;
144 identify_cpu(c);
145 print_cpu_info(c);
146}
147
148static atomic_t init_deasserted __cpuinitdata;
149
150/*
151 * Report back to the Boot Processor.
152 * Running on AP.
153 */
154void __cpuinit smp_callin(void)
155{
156 int cpuid, phys_id;
157 unsigned long timeout;
158
159 /*
160 * If woken up by an INIT in an 82489DX configuration
161 * we may get here before an INIT-deassert IPI reaches
162 * our local APIC. We have to wait for the IPI or we'll
163 * lock up on an APIC access.
164 */
165 while (!atomic_read(&init_deasserted))
166 cpu_relax();
167
168 /*
169 * (This works even if the APIC is not enabled.)
170 */
171 phys_id = GET_APIC_ID(apic_read(APIC_ID));
172 cpuid = smp_processor_id();
173 if (cpu_isset(cpuid, cpu_callin_map)) {
174 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
175 phys_id, cpuid);
176 }
177 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
178
179 /*
180 * STARTUP IPIs are fragile beasts as they might sometimes
181 * trigger some glue motherboard logic. Complete APIC bus
182 * silence for 1 second, this overestimates the time the
183 * boot CPU is spending to send the up to 2 STARTUP IPIs
184 * by a factor of two. This should be enough.
185 */
186
187 /*
188 * Waiting 2s total for startup (udelay is not yet working)
189 */
190 timeout = jiffies + 2*HZ;
191 while (time_before(jiffies, timeout)) {
192 /*
193 * Has the boot CPU finished its STARTUP sequence?
194 */
195 if (cpu_isset(cpuid, cpu_callout_map))
196 break;
197 cpu_relax();
198 }
199
200 if (!time_before(jiffies, timeout)) {
201 panic("smp_callin: CPU%d started up but did not get a callout!\n",
202 cpuid);
203 }
204
205 /*
206 * the boot CPU has finished the init stage and is spinning
207 * on callin_map until we finish. We are free to set up this
208 * CPU, first the APIC. (this is probably redundant on most
209 * boards)
210 */
211
212 Dprintk("CALLIN, before setup_local_APIC().\n");
213 setup_local_APIC();
214
215 /*
216 * Get our bogomips.
217 *
218 * Need to enable IRQs because calibration can take a while,
219 * and the NMI watchdog might otherwise kill us.
220 */
221 local_irq_enable();
222 calibrate_delay();
223 local_irq_disable();
224 Dprintk("Stack at about %p\n",&cpuid);
225
226 disable_APIC_timer();
227
228 /*
229 * Save our processor parameters
230 */
231 smp_store_cpu_info(cpuid);
232
233 /*
234 * Allow the master to continue.
235 */
236 cpu_set(cpuid, cpu_callin_map);
237}
238
239/* maps the cpu to the sched domain representing multi-core */
240cpumask_t cpu_coregroup_map(int cpu)
241{
242 struct cpuinfo_x86 *c = cpu_data + cpu;
243 /*
244 * For perf, we return last level cache shared map.
245 * And for power savings, we return cpu_core_map
246 */
247 if (sched_mc_power_savings || sched_smt_power_savings)
248 return cpu_core_map[cpu];
249 else
250 return c->llc_shared_map;
251}
252
253/* representing cpus for which sibling maps can be computed */
254static cpumask_t cpu_sibling_setup_map;
255
256static inline void set_cpu_sibling_map(int cpu)
257{
258 int i;
259 struct cpuinfo_x86 *c = cpu_data;
260
261 cpu_set(cpu, cpu_sibling_setup_map);
262
263 if (smp_num_siblings > 1) {
264 for_each_cpu_mask(i, cpu_sibling_setup_map) {
265 if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
266 c[cpu].cpu_core_id == c[i].cpu_core_id) {
267 cpu_set(i, cpu_sibling_map[cpu]);
268 cpu_set(cpu, cpu_sibling_map[i]);
269 cpu_set(i, cpu_core_map[cpu]);
270 cpu_set(cpu, cpu_core_map[i]);
271 cpu_set(i, c[cpu].llc_shared_map);
272 cpu_set(cpu, c[i].llc_shared_map);
273 }
274 }
275 } else {
276 cpu_set(cpu, cpu_sibling_map[cpu]);
277 }
278
279 cpu_set(cpu, c[cpu].llc_shared_map);
280
281 if (current_cpu_data.x86_max_cores == 1) {
282 cpu_core_map[cpu] = cpu_sibling_map[cpu];
283 c[cpu].booted_cores = 1;
284 return;
285 }
286
287 for_each_cpu_mask(i, cpu_sibling_setup_map) {
288 if (cpu_llc_id[cpu] != BAD_APICID &&
289 cpu_llc_id[cpu] == cpu_llc_id[i]) {
290 cpu_set(i, c[cpu].llc_shared_map);
291 cpu_set(cpu, c[i].llc_shared_map);
292 }
293 if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
294 cpu_set(i, cpu_core_map[cpu]);
295 cpu_set(cpu, cpu_core_map[i]);
296 /*
297 * Does this new cpu bring up a new core?
298 */
299 if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
300 /*
301 * for each core in package, increment
302 * the booted_cores for this new cpu
303 */
304 if (first_cpu(cpu_sibling_map[i]) == i)
305 c[cpu].booted_cores++;
306 /*
307 * increment the core count for all
308 * the other cpus in this package
309 */
310 if (i != cpu)
311 c[i].booted_cores++;
312 } else if (i != cpu && !c[cpu].booted_cores)
313 c[cpu].booted_cores = c[i].booted_cores;
314 }
315 }
316}
317
318/*
319 * Setup code on secondary processor (after coming out of the trampoline)
320 */
321void __cpuinit start_secondary(void)
322{
323 /*
324 * Don't put anything before smp_callin(); SMP
325 * booting is fragile enough that we want to limit the
326 * things done here to the bare minimum.
327 */
328 cpu_init();
329 preempt_disable();
330 smp_callin();
331
332 /* otherwise gcc will move up smp_processor_id() before cpu_init() */
333 barrier();
334
335 /*
336 * Check TSC sync first:
337 */
338 check_tsc_sync_target();
339
340 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
341 setup_secondary_APIC_clock();
342
343 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
344
345 if (nmi_watchdog == NMI_IO_APIC) {
346 disable_8259A_irq(0);
347 enable_NMI_through_LVT0(NULL);
348 enable_8259A_irq(0);
349 }
350
351 enable_APIC_timer();
352
353 /*
354 * The sibling maps must be set before turning the online map on for
355 * this cpu
356 */
357 set_cpu_sibling_map(smp_processor_id());
358
359 /*
360 * We need to hold call_lock, so there is no inconsistency
361 * between the time smp_call_function() determines the number of
362 * IPI recipients, and the time when the determination is made
363 * for which cpus receive the IPI in genapic_flat.c. Holding this
364 * lock helps us to not include this cpu in a currently in progress
365 * smp_call_function().
366 */
367 lock_ipi_call_lock();
368 spin_lock(&vector_lock);
369
370 /* Setup the per cpu irq handling data structures */
371 __setup_vector_irq(smp_processor_id());
372 /*
373 * Allow the master to continue.
374 */
375 cpu_set(smp_processor_id(), cpu_online_map);
376 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
377 spin_unlock(&vector_lock);
378
379 unlock_ipi_call_lock();
380
381 cpu_idle();
382}
383
384extern volatile unsigned long init_rsp;
385extern void (*initial_code)(void);
386
387#ifdef APIC_DEBUG
388static void inquire_remote_apic(int apicid)
389{
390 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
391 char *names[] = { "ID", "VERSION", "SPIV" };
392 int timeout;
393 unsigned int status;
394
395 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
396
397 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
398 printk("... APIC #%d %s: ", apicid, names[i]);
399
400 /*
401 * Wait for idle.
402 */
403 status = safe_apic_wait_icr_idle();
404 if (status)
405 printk("a previous APIC delivery may have failed\n");
406
407 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
408 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
409
410 timeout = 0;
411 do {
412 udelay(100);
413 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
414 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
415
416 switch (status) {
417 case APIC_ICR_RR_VALID:
418 status = apic_read(APIC_RRR);
419 printk("%08x\n", status);
420 break;
421 default:
422 printk("failed\n");
423 }
424 }
425}
426#endif
427
428/*
429 * Kick the secondary to wake up.
430 */
431static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
432{
433 unsigned long send_status, accept_status = 0;
434 int maxlvt, num_starts, j;
435
436 Dprintk("Asserting INIT.\n");
437
438 /*
439 * Turn INIT on target chip
440 */
441 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
442
443 /*
444 * Send IPI
445 */
446 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
447 | APIC_DM_INIT);
448
449 Dprintk("Waiting for send to finish...\n");
450 send_status = safe_apic_wait_icr_idle();
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 /* Target chip */
457 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
458
459 /* Send IPI */
460 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
461
462 Dprintk("Waiting for send to finish...\n");
463 send_status = safe_apic_wait_icr_idle();
464
465 mb();
466 atomic_set(&init_deasserted, 1);
467
468 num_starts = 2;
469
470 /*
471 * Run STARTUP IPI loop.
472 */
473 Dprintk("#startup loops: %d.\n", num_starts);
474
475 maxlvt = get_maxlvt();
476
477 for (j = 1; j <= num_starts; j++) {
478 Dprintk("Sending STARTUP #%d.\n",j);
479 apic_write(APIC_ESR, 0);
480 apic_read(APIC_ESR);
481 Dprintk("After apic_write.\n");
482
483 /*
484 * STARTUP IPI
485 */
486
487 /* Target chip */
488 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
489
490 /* Boot on the stack */
491		/* Kick the secondary */
492 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
493
494 /*
495 * Give the other CPU some time to accept the IPI.
496 */
497 udelay(300);
498
499 Dprintk("Startup point 1.\n");
500
501 Dprintk("Waiting for send to finish...\n");
502 send_status = safe_apic_wait_icr_idle();
503
504 /*
505 * Give the other CPU some time to accept the IPI.
506 */
507 udelay(200);
508 /*
509 * Due to the Pentium erratum 3AP.
510 */
511 if (maxlvt > 3) {
512 apic_write(APIC_ESR, 0);
513 }
514 accept_status = (apic_read(APIC_ESR) & 0xEF);
515 if (send_status || accept_status)
516 break;
517 }
518 Dprintk("After Startup.\n");
519
520 if (send_status)
521 printk(KERN_ERR "APIC never delivered???\n");
522 if (accept_status)
523 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
524
525 return (send_status | accept_status);
526}
527
528struct create_idle {
529 struct work_struct work;
530 struct task_struct *idle;
531 struct completion done;
532 int cpu;
533};
534
535void do_fork_idle(struct work_struct *work)
536{
537 struct create_idle *c_idle =
538 container_of(work, struct create_idle, work);
539
540 c_idle->idle = fork_idle(c_idle->cpu);
541 complete(&c_idle->done);
542}
543
544/*
545 * Boot one CPU.
546 */
547static int __cpuinit do_boot_cpu(int cpu, int apicid)
548{
549 unsigned long boot_error;
550 int timeout;
551 unsigned long start_rip;
552 struct create_idle c_idle = {
553 .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
554 .cpu = cpu,
555 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
556 };
557
558 /* allocate memory for gdts of secondary cpus. Hotplug is considered */
559 if (!cpu_gdt_descr[cpu].address &&
560 !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
561 printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
562 return -1;
563 }
564
565 /* Allocate node local memory for AP pdas */
566 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
567 struct x8664_pda *newpda, *pda;
568 int node = cpu_to_node(cpu);
569 pda = cpu_pda(cpu);
570 newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
571 node);
572 if (newpda) {
573 memcpy(newpda, pda, sizeof (struct x8664_pda));
574 cpu_pda(cpu) = newpda;
575 } else
576 printk(KERN_ERR
577 "Could not allocate node local PDA for CPU %d on node %d\n",
578 cpu, node);
579 }
580
581 alternatives_smp_switch(1);
582
583 c_idle.idle = get_idle_for_cpu(cpu);
584
585 if (c_idle.idle) {
586 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
587 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
588 init_idle(c_idle.idle, cpu);
589 goto do_rest;
590 }
591
592 /*
593	 * During the cold boot process the keventd thread is not spun up yet.
594	 * When we do cpu hot-add we create idle threads on the fly, and they
595	 * should not inherit any attributes from the calling context; hence
596	 * the clean way to create kernel threads is to do it from keventd().
597	 * The current_is_keventd() check is needed because the ACPI notifier
598	 * also queues work to keventd(), and if the caller is already running
599	 * in keventd() context, queuing and waiting here would deadlock the
600	 * keventd thread.
601 */
602 if (!keventd_up() || current_is_keventd())
603 c_idle.work.func(&c_idle.work);
604 else {
605 schedule_work(&c_idle.work);
606 wait_for_completion(&c_idle.done);
607 }
608
609 if (IS_ERR(c_idle.idle)) {
610 printk("failed fork for CPU %d\n", cpu);
611 return PTR_ERR(c_idle.idle);
612 }
613
614 set_idle_for_cpu(cpu, c_idle.idle);
615
616do_rest:
617
618 cpu_pda(cpu)->pcurrent = c_idle.idle;
619
620 start_rip = setup_trampoline();
621
622 init_rsp = c_idle.idle->thread.rsp;
623 per_cpu(init_tss,cpu).rsp0 = init_rsp;
624 initial_code = start_secondary;
625 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
626
627 printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
628 cpus_weight(cpu_present_map),
629 apicid);
630
631 /*
632 * This grunge runs the startup process for
633 * the targeted processor.
634 */
635
636 atomic_set(&init_deasserted, 0);
637
638 Dprintk("Setting warm reset code and vector.\n");
639
640 CMOS_WRITE(0xa, 0xf);
641 local_flush_tlb();
642 Dprintk("1.\n");
643 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
644 Dprintk("2.\n");
645 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
646 Dprintk("3.\n");
647
648 /*
649 * Be paranoid about clearing APIC errors.
650 */
651 apic_write(APIC_ESR, 0);
652 apic_read(APIC_ESR);
653
654 /*
655 * Status is now clean
656 */
657 boot_error = 0;
658
659 /*
660 * Starting actual IPI sequence...
661 */
662 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
663
664 if (!boot_error) {
665 /*
666 * allow APs to start initializing.
667 */
668 Dprintk("Before Callout %d.\n", cpu);
669 cpu_set(cpu, cpu_callout_map);
670 Dprintk("After Callout %d.\n", cpu);
671
672 /*
673 * Wait 5s total for a response
674 */
675 for (timeout = 0; timeout < 50000; timeout++) {
676 if (cpu_isset(cpu, cpu_callin_map))
677 break; /* It has booted */
678 udelay(100);
679 }
680
681 if (cpu_isset(cpu, cpu_callin_map)) {
682 /* number CPUs logically, starting from 1 (BSP is 0) */
683 Dprintk("CPU has booted.\n");
684 } else {
685 boot_error = 1;
686 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
687 == 0xA5)
688 /* trampoline started but...? */
689 printk("Stuck ??\n");
690 else
691 /* trampoline code not run */
692 printk("Not responding.\n");
693#ifdef APIC_DEBUG
694 inquire_remote_apic(apicid);
695#endif
696 }
697 }
698 if (boot_error) {
699 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
700 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
701 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
702 cpu_clear(cpu, cpu_present_map);
703 cpu_clear(cpu, cpu_possible_map);
704 x86_cpu_to_apicid[cpu] = BAD_APICID;
705 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
706 return -EIO;
707 }
708
709 return 0;
710}
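
As a purely illustrative aside (not part of this file), the address arithmetic used above can be checked in user space: do_boot_cpu() stores the trampoline address as a real-mode segment:offset pair in the warm-reset vector at physical 0x467/0x469, and wakeup_secondary_via_INIT() sends bits 12-19 of the same address as the STARTUP IPI vector, which is why the trampoline must be 4 KiB aligned and live below 1 MiB. The 0x6000 base below is a made-up example value, not the kernel's actual SMP_TRAMPOLINE_BASE.

#include <stdio.h>

int main(void)
{
	unsigned long start_rip = 0x6000;	/* hypothetical trampoline base */
	unsigned short seg  = start_rip >> 4;	/* written to phys 0x469 */
	unsigned short off  = start_rip & 0xf;	/* written to phys 0x467 */
	unsigned char  sipi = start_rip >> 12;	/* vector in the STARTUP IPI */

	/* segment:offset must reconstruct the original address */
	printf("warm reset vector %04x:%04x -> %#lx\n",
	       (unsigned)seg, (unsigned)off,
	       ((unsigned long)seg << 4) + off);
	/* a SIPI vector of V makes the AP start at real-mode CS = V << 8, IP = 0 */
	printf("SIPI vector %#x -> real-mode CS %#x, IP 0\n",
	       (unsigned)sipi, (unsigned)sipi << 8);
	return 0;
}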
711
712cycles_t cacheflush_time;
713unsigned long cache_decay_ticks;
714
715/*
716 * Cleanup possible dangling ends...
717 */
718static __cpuinit void smp_cleanup_boot(void)
719{
720 /*
721 * Paranoid: Set warm reset code and vector here back
722 * to default values.
723 */
724 CMOS_WRITE(0, 0xf);
725
726 /*
727 * Reset trampoline flag
728 */
729 *((volatile int *) phys_to_virt(0x467)) = 0;
730}
731
732/*
733 * Fall back to non SMP mode after errors.
734 *
735 * RED-PEN audit/test this more. I bet there is more state messed up here.
736 */
737static __init void disable_smp(void)
738{
739 cpu_present_map = cpumask_of_cpu(0);
740 cpu_possible_map = cpumask_of_cpu(0);
741 if (smp_found_config)
742 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
743 else
744 phys_cpu_present_map = physid_mask_of_physid(0);
745 cpu_set(0, cpu_sibling_map[0]);
746 cpu_set(0, cpu_core_map[0]);
747}
748
749#ifdef CONFIG_HOTPLUG_CPU
750
751int additional_cpus __initdata = -1;
752
753/*
754 * cpu_possible_map should be static: it cannot change as cpus are
755 * onlined or offlined. The reason is that per-cpu data structures
756 * are allocated by some modules at init time, and they don't expect
757 * to do this dynamically on cpu arrival/departure.
758 * cpu_present_map, on the other hand, can change dynamically.
759 * When cpu hotplug is not compiled in, we fall back to the current
760 * behaviour, which is cpu_possible == cpu_present.
761 * - Ashok Raj
762 *
763 * Three ways to find out the number of additional hotplug CPUs:
764 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
765 * - The user can override it with additional_cpus=NUM
766 * - Otherwise don't reserve additional CPUs.
767 * We do this because additional CPUs waste a lot of memory.
768 * -AK
769 */
770__init void prefill_possible_map(void)
771{
772 int i;
773 int possible;
774
775 if (additional_cpus == -1) {
776 if (disabled_cpus > 0)
777 additional_cpus = disabled_cpus;
778 else
779 additional_cpus = 0;
780 }
781 possible = num_processors + additional_cpus;
782 if (possible > NR_CPUS)
783 possible = NR_CPUS;
784
785 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
786 possible,
787 max_t(int, possible - num_processors, 0));
788
789 for (i = 0; i < possible; i++)
790 cpu_set(i, cpu_possible_map);
791}
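
A user-space sketch of the sizing policy that prefill_possible_map() above implements, with made-up numbers: additional_cpus defaults to the BIOS-disabled CPU count and the sum is clamped to the NR_CPUS limit.

#include <stdio.h>

int main(void)
{
	int num_processors  = 2;	/* CPUs enabled in the MP/ACPI tables */
	int disabled_cpus   = 2;	/* CPUs the BIOS listed as disabled */
	int additional_cpus = -1;	/* no additional_cpus= on the command line */
	int nr_cpus_limit   = 8;	/* stand-in for the NR_CPUS config limit */
	int possible;

	if (additional_cpus == -1)
		additional_cpus = disabled_cpus > 0 ? disabled_cpus : 0;

	possible = num_processors + additional_cpus;
	if (possible > nr_cpus_limit)
		possible = nr_cpus_limit;

	printf("SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible,
	       possible - num_processors > 0 ? possible - num_processors : 0);
	return 0;
}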
792#endif
793
794/*
795 * Various sanity checks.
796 */
797static int __init smp_sanity_check(unsigned max_cpus)
798{
799 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
800 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
801 hard_smp_processor_id());
802 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
803 }
804
805 /*
806 * If we couldn't find an SMP configuration at boot time,
807 * get out of here now!
808 */
809 if (!smp_found_config) {
810 printk(KERN_NOTICE "SMP motherboard not detected.\n");
811 disable_smp();
812 if (APIC_init_uniprocessor())
813 printk(KERN_NOTICE "Local APIC not detected."
814 " Using dummy APIC emulation.\n");
815 return -1;
816 }
817
818 /*
819 * Should not be necessary because the MP table should list the boot
820 * CPU too, but we do it for the sake of robustness anyway.
821 */
822 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
823 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
824 boot_cpu_id);
825 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
826 }
827
828 /*
829 * If we couldn't find a local APIC, then get out of here now!
830 */
831 if (!cpu_has_apic) {
832 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
833 boot_cpu_id);
834 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
835 nr_ioapics = 0;
836 return -1;
837 }
838
839 /*
840 * If SMP should be disabled, then really disable it!
841 */
842 if (!max_cpus) {
843 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
844 nr_ioapics = 0;
845 return -1;
846 }
847
848 return 0;
849}
850
851/*
852 * Prepare for SMP bootup. The MP table or ACPI has been read
853 * earlier. Just do some sanity checking here and enable APIC mode.
854 */
855void __init smp_prepare_cpus(unsigned int max_cpus)
856{
857 nmi_watchdog_default();
858 current_cpu_data = boot_cpu_data;
859 current_thread_info()->cpu = 0; /* needed? */
860 set_cpu_sibling_map(0);
861
862 if (smp_sanity_check(max_cpus) < 0) {
863 printk(KERN_INFO "SMP disabled\n");
864 disable_smp();
865 return;
866 }
867
868
869 /*
870 * Switch from PIC to APIC mode.
871 */
872 setup_local_APIC();
873
874 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
875 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
876 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
877 /* Or can we switch back to PIC here? */
878 }
879
880 /*
881 * Now start the IO-APICs
882 */
883 if (!skip_ioapic_setup && nr_ioapics)
884 setup_IO_APIC();
885 else
886 nr_ioapics = 0;
887
888 /*
889 * Set up local APIC timer on boot CPU.
890 */
891
892 setup_boot_APIC_clock();
893}
894
895/*
896 * Early setup to make printk work.
897 */
898void __init smp_prepare_boot_cpu(void)
899{
900 int me = smp_processor_id();
901 cpu_set(me, cpu_online_map);
902 cpu_set(me, cpu_callout_map);
903 per_cpu(cpu_state, me) = CPU_ONLINE;
904}
905
906/*
907 * Entry point to boot a CPU.
908 */
909int __cpuinit __cpu_up(unsigned int cpu)
910{
911 int apicid = cpu_present_to_apicid(cpu);
912 unsigned long flags;
913 int err;
914
915 WARN_ON(irqs_disabled());
916
917 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
918
919 if (apicid == BAD_APICID || apicid == boot_cpu_id ||
920 !physid_isset(apicid, phys_cpu_present_map)) {
921 printk("__cpu_up: bad cpu %d\n", cpu);
922 return -EINVAL;
923 }
924
925 /*
926 * Already booted CPU?
927 */
928 if (cpu_isset(cpu, cpu_callin_map)) {
929 Dprintk("do_boot_cpu %d Already started\n", cpu);
930 return -ENOSYS;
931 }
932
933 /*
934 * Save current MTRR state in case it was changed since early boot
935 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
936 */
937 mtrr_save_state();
938
939 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
940 /* Boot it! */
941 err = do_boot_cpu(cpu, apicid);
942 if (err < 0) {
943 Dprintk("do_boot_cpu failed %d\n", err);
944 return err;
945 }
946
947 /* Unleash the CPU! */
948 Dprintk("waiting for cpu %d\n", cpu);
949
950 /*
951	 * Make sure to check TSC sync:
952 */
953 local_irq_save(flags);
954 check_tsc_sync_source(cpu);
955 local_irq_restore(flags);
956
957 while (!cpu_isset(cpu, cpu_online_map))
958 cpu_relax();
959 err = 0;
960
961 return err;
962}
963
964/*
965 * Finish the SMP boot.
966 */
967void __init smp_cpus_done(unsigned int max_cpus)
968{
969 smp_cleanup_boot();
970 setup_ioapic_dest();
971 check_nmi_watchdog();
972}
973
974#ifdef CONFIG_HOTPLUG_CPU
975
976static void remove_siblinginfo(int cpu)
977{
978 int sibling;
979 struct cpuinfo_x86 *c = cpu_data;
980
981 for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
982 cpu_clear(cpu, cpu_core_map[sibling]);
983 /*
984 * last thread sibling in this cpu core going down
985 */
986 if (cpus_weight(cpu_sibling_map[cpu]) == 1)
987 c[sibling].booted_cores--;
988 }
989
990 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
991 cpu_clear(cpu, cpu_sibling_map[sibling]);
992 cpus_clear(cpu_sibling_map[cpu]);
993 cpus_clear(cpu_core_map[cpu]);
994 c[cpu].phys_proc_id = 0;
995 c[cpu].cpu_core_id = 0;
996 cpu_clear(cpu, cpu_sibling_setup_map);
997}
998
999void remove_cpu_from_maps(void)
1000{
1001 int cpu = smp_processor_id();
1002
1003 cpu_clear(cpu, cpu_callout_map);
1004 cpu_clear(cpu, cpu_callin_map);
1005 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1006 clear_node_cpumask(cpu);
1007}
1008
1009int __cpu_disable(void)
1010{
1011 int cpu = smp_processor_id();
1012
1013 /*
1014 * Perhaps use cpufreq to drop frequency, but that could go
1015 * into generic code.
1016 *
1017 * We won't take down the boot processor on i386 due to some
1018 * interrupts only being able to be serviced by the BSP.
1019 * Especially so if we're not using an IOAPIC -zwane
1020 */
1021 if (cpu == 0)
1022 return -EBUSY;
1023
1024 if (nmi_watchdog == NMI_LOCAL_APIC)
1025 stop_apic_nmi_watchdog(NULL);
1026 clear_local_APIC();
1027
1028 /*
1029 * HACK:
1030 * Allow any queued timer interrupts to get serviced
1031	 * This is only a temporary solution until we clean up
1032 * fixup_irqs as we do for IA64.
1033 */
1034 local_irq_enable();
1035 mdelay(1);
1036
1037 local_irq_disable();
1038 remove_siblinginfo(cpu);
1039
1040 spin_lock(&vector_lock);
1041 /* It's now safe to remove this processor from the online map */
1042 cpu_clear(cpu, cpu_online_map);
1043 spin_unlock(&vector_lock);
1044 remove_cpu_from_maps();
1045 fixup_irqs(cpu_online_map);
1046 return 0;
1047}
1048
1049void __cpu_die(unsigned int cpu)
1050{
1051 /* We don't do anything here: idle task is faking death itself. */
1052 unsigned int i;
1053
1054 for (i = 0; i < 10; i++) {
1055 /* They ack this in play_dead by setting CPU_DEAD */
1056 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1057 printk ("CPU %d is now offline\n", cpu);
1058 if (1 == num_online_cpus())
1059 alternatives_smp_switch(0);
1060 return;
1061 }
1062 msleep(100);
1063 }
1064 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1065}
1066
1067static __init int setup_additional_cpus(char *s)
1068{
1069 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1070}
1071early_param("additional_cpus", setup_additional_cpus);
1072
1073#else /* ... !CONFIG_HOTPLUG_CPU */
1074
1075int __cpu_disable(void)
1076{
1077 return -ENOSYS;
1078}
1079
1080void __cpu_die(unsigned int cpu)
1081{
1082 /* We said "no" in __cpu_disable */
1083 BUG();
1084}
1085#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
new file mode 100644
index 000000000000..bbfe85a0f699
--- /dev/null
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -0,0 +1,81 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7DEFINE_PER_CPU(unsigned long, this_cpu_off);
8EXPORT_PER_CPU_SYMBOL(this_cpu_off);
9
10/* Initialize the CPU's GDT. This is either the boot CPU doing it for itself
11 (still using the master per-cpu area), or a CPU doing it for a
12 secondary which will soon come up. */
13__cpuinit void init_gdt(int cpu)
14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8);
21
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu;
24}
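
A conceptual, user-space-only sketch of what the GDT_ENTRY_PERCPU segment set up above provides: its base is this CPU's entry in __per_cpu_offset, so a segment-relative access at a per-cpu variable's link-time offset lands in that CPU's private copy. All addresses below are invented for illustration.

#include <stdio.h>

int main(void)
{
	/* hypothetical per-cpu area bases, one per CPU (used as segment bases) */
	unsigned long per_cpu_offset[2] = { 0xc1800000UL, 0xc1804000UL };
	unsigned long var_offset = 0x10;	/* link-time offset of one per-cpu variable */

	for (int cpu = 0; cpu < 2; cpu++)
		printf("cpu%d: segment base %#lx + offset %#lx -> %#lx\n",
		       cpu, per_cpu_offset[cpu], var_offset,
		       per_cpu_offset[cpu] + var_offset);
	return 0;
}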
25
26
27/**
28 * smp_call_function(): Run a function on all other CPUs.
29 * @func: The function to run. This must be fast and non-blocking.
30 * @info: An arbitrary pointer to pass to the function.
31 * @nonatomic: Unused.
32 * @wait: If true, wait (atomically) until function has completed on other CPUs.
33 *
34 * Returns 0 on success, else a negative status code.
35 *
36 * If @wait is true, then returns once @func has returned; otherwise
37 * it returns just before the target cpu calls @func.
38 *
39 * You must not call this function with disabled interrupts or from a
40 * hardware interrupt handler or from a bottom half handler.
41 */
42int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
43 int wait)
44{
45 return smp_call_function_mask(cpu_online_map, func, info, wait);
46}
47EXPORT_SYMBOL(smp_call_function);
48
49/**
50 * smp_call_function_single - Run a function on a specific CPU
51 * @cpu: The target CPU. Cannot be the calling CPU.
52 * @func: The function to run. This must be fast and non-blocking.
53 * @info: An arbitrary pointer to pass to the function.
54 * @nonatomic: Unused.
55 * @wait: If true, wait until function has completed on other CPUs.
56 *
57 * Returns 0 on success, else a negative status code.
58 *
59 * If @wait is true, then returns once @func has returned; otherwise
60 * it returns just before the target cpu calls @func.
61 */
62int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
63 int nonatomic, int wait)
64{
65 /* prevent preemption and reschedule on another processor */
66 int ret;
67 int me = get_cpu();
68 if (cpu == me) {
69 local_irq_disable();
70 func(info);
71 local_irq_enable();
72 put_cpu();
73 return 0;
74 }
75
76 ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
77
78 put_cpu();
79 return ret;
80}
81EXPORT_SYMBOL(smp_call_function_single);
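
A hedged usage sketch for the helpers above (not part of this file): run a short, non-blocking callback on one specific CPU and wait for it to finish. The callback name and the target CPU number are made up for the example; the call signature is the one defined just above.

/* in code that includes <linux/smp.h> and <linux/kernel.h> */
static void flush_my_state(void *info)
{
	/* runs on the target CPU, possibly in interrupt context: keep it short */
}

static void example_caller(void)
{
	/* must not be called with interrupts disabled or from IRQ context */
	if (smp_call_function_single(2, flush_my_state, NULL, 0, 1))
		printk(KERN_WARNING "flush_my_state did not run on CPU 2\n");
}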
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
new file mode 100644
index 000000000000..2a8713ec0f9a
--- /dev/null
+++ b/arch/x86/kernel/srat_32.c
@@ -0,0 +1,360 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34
35/*
36 * proximity macros and definitions
37 */
38#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
39#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
40#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
41#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
42/* bitmap length; _PXM is at most 255 */
43#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
44static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
45
46#define MAX_CHUNKS_PER_NODE 3
47#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
48struct node_memory_chunk_s {
49 unsigned long start_pfn;
50 unsigned long end_pfn;
51 u8 pxm; // proximity domain of node
52 u8 nid; // which cnode contains this chunk?
53 u8 bank; // which mem bank on this node
54};
55static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
56
57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID];
59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p)
64{
65 struct acpi_srat_cpu_affinity *cpu_affinity =
66 (struct acpi_srat_cpu_affinity *) p;
67
68 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
69 return; /* empty entry */
70
71 /* mark this node as "seen" in node bitmap */
72 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
73
74 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
75
76 printk("CPU 0x%02X in proximity domain 0x%02X\n",
77 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
78}
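
A small user-space rendering of the pxm_bitmap helpers used above, just to make the bit arithmetic concrete: proximity domain N lives in byte N/8, bit N%8 of the bitmap. The domain numbers are arbitrary examples.

#include <stdio.h>

#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char bitmap[256 / 8] = { 0 };

	BMAP_SET(bitmap, 0);	/* proximity domain 0  -> byte 0, bit 0 */
	BMAP_SET(bitmap, 10);	/* proximity domain 10 -> byte 1, bit 2 */

	printf("domain 10 seen: %d\n", BMAP_TEST(bitmap, 10) ? 1 : 0);
	printf("domain 11 seen: %d\n", BMAP_TEST(bitmap, 11) ? 1 : 0);
	return 0;
}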
79
80/*
81 * Identify memory proximity domains and hot-remove capabilities.
82 * Fill node memory chunk list structure.
83 */
84static void __init parse_memory_affinity_structure (char *sratp)
85{
86 unsigned long long paddr, size;
87 unsigned long start_pfn, end_pfn;
88 u8 pxm;
89 struct node_memory_chunk_s *p, *q, *pend;
90 struct acpi_srat_mem_affinity *memory_affinity =
91 (struct acpi_srat_mem_affinity *) sratp;
92
93 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
94 return; /* empty entry */
95
96 pxm = memory_affinity->proximity_domain & 0xff;
97
98 /* mark this node as "seen" in node bitmap */
99 BMAP_SET(pxm_bitmap, pxm);
100
101 /* calculate info for memory chunk structure */
102 paddr = memory_affinity->base_address;
103 size = memory_affinity->length;
104
105 start_pfn = paddr >> PAGE_SHIFT;
106 end_pfn = (paddr + size) >> PAGE_SHIFT;
107
108
109 if (num_memory_chunks >= MAXCHUNKS) {
110 printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
111 size/(1024*1024), paddr);
112 return;
113 }
114
115 /* Insertion sort based on base address */
116 pend = &node_memory_chunk[num_memory_chunks];
117 for (p = &node_memory_chunk[0]; p < pend; p++) {
118 if (start_pfn < p->start_pfn)
119 break;
120 }
121 if (p < pend) {
122 for (q = pend; q >= p; q--)
123 *(q + 1) = *q;
124 }
125 p->start_pfn = start_pfn;
126 p->end_pfn = end_pfn;
127 p->pxm = pxm;
128
129 num_memory_chunks++;
130
131 printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
132 start_pfn, end_pfn,
133 memory_affinity->memory_type,
134 pxm,
135 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
136 "enabled and removable" : "enabled" ) );
137}
138
139/*
140 * The SRAT table always lists ascending addresses, so we can always
141 * assume that the first "start" address that you see is the real
142 * start of the node, and that the current "end" address is after
143 * the previous one.
144 */
145static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
146{
147 /*
148 * Only add present memory as told by the e820.
149 * There is no guarantee from the SRAT that the memory it
150 * enumerates is present at boot time because it represents
151 * *possible* memory hotplug areas the same as normal RAM.
152 */
153 if (memory_chunk->start_pfn >= max_pfn) {
154 printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
155 memory_chunk->start_pfn, memory_chunk->end_pfn);
156 return;
157 }
158 if (memory_chunk->nid != nid)
159 return;
160
161 if (!node_has_online_mem(nid))
162 node_start_pfn[nid] = memory_chunk->start_pfn;
163
164 if (node_start_pfn[nid] > memory_chunk->start_pfn)
165 node_start_pfn[nid] = memory_chunk->start_pfn;
166
167 if (node_end_pfn[nid] < memory_chunk->end_pfn)
168 node_end_pfn[nid] = memory_chunk->end_pfn;
169}
170
171/* Parse the ACPI Static Resource Affinity Table */
172static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
173{
174 u8 *start, *end, *p;
175 int i, j, nid;
176
177 start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
178 p = start;
179 end = (u8 *)sratp + sratp->header.length;
180
181 memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
182 memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
183
184 num_memory_chunks = 0;
185 while (p < end) {
186 switch (*p) {
187 case ACPI_SRAT_TYPE_CPU_AFFINITY:
188 parse_cpu_affinity_structure(p);
189 break;
190 case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
191 parse_memory_affinity_structure(p);
192 break;
193 default:
194 printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
195 break;
196 }
197 p += p[1];
198 if (p[1] == 0) {
199 printk("acpi20_parse_srat: Entry length value is zero;"
200 " can't parse any further!\n");
201 break;
202 }
203 }
204
205 if (num_memory_chunks == 0) {
206		printk("could not find any ACPI SRAT memory areas.\n");
207 goto out_fail;
208 }
209
210 /* Calculate total number of nodes in system from PXM bitmap and create
211 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
212 * to specify the range of _PXM values.)
213 */
214 /*
215 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
216 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
217 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
218 * approaches MAX_PXM_DOMAINS for i386.
219 */
220 nodes_clear(node_online_map);
221 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
222 if (BMAP_TEST(pxm_bitmap, i)) {
223 int nid = acpi_map_pxm_to_node(i);
224 node_set_online(nid);
225 }
226 }
227 BUG_ON(num_online_nodes() == 0);
228
229 /* set cnode id in memory chunk structure */
230 for (i = 0; i < num_memory_chunks; i++)
231 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
232
233 printk("pxm bitmap: ");
234 for (i = 0; i < sizeof(pxm_bitmap); i++) {
235 printk("%02X ", pxm_bitmap[i]);
236 }
237 printk("\n");
238 printk("Number of logical nodes in system = %d\n", num_online_nodes());
239 printk("Number of memory chunks in system = %d\n", num_memory_chunks);
240
241 for (i = 0; i < MAX_APICID; i++)
242 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
243
244 for (j = 0; j < num_memory_chunks; j++){
245 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
246 printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
247 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
248 node_read_chunk(chunk->nid, chunk);
249 add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
250 }
251
252 for_each_online_node(nid) {
253 unsigned long start = node_start_pfn[nid];
254 unsigned long end = node_end_pfn[nid];
255
256 memory_present(nid, start, end);
257 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
258 }
259 return 1;
260out_fail:
261 return 0;
262}
263
264struct acpi_static_rsdt {
265 struct acpi_table_rsdt table;
266 u32 padding[7]; /* Allow for 7 more table entries */
267};
268
269int __init get_memcfg_from_srat(void)
270{
271 struct acpi_table_header *header = NULL;
272 struct acpi_table_rsdp *rsdp = NULL;
273 struct acpi_table_rsdt *rsdt = NULL;
274 acpi_native_uint rsdp_address = 0;
275 struct acpi_static_rsdt saved_rsdt;
276 int tables = 0;
277 int i = 0;
278
279 rsdp_address = acpi_find_rsdp();
280 if (!rsdp_address) {
281 printk("%s: System description tables not found\n",
282 __FUNCTION__);
283 goto out_err;
284 }
285
286 printk("%s: assigning address to rsdp\n", __FUNCTION__);
287 rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
288 if (!rsdp) {
289 printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
290 goto out_err;
291 }
292
293 printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
294 rsdp->oem_id);
295
296 if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
297 printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
298 goto out_err;
299 }
300
301 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303
304 if (!rsdt) {
305 printk(KERN_WARNING
306 "%s: ACPI: Invalid root system description tables (RSDT)\n",
307 __FUNCTION__);
308 goto out_err;
309 }
310
311 header = &rsdt->header;
312
313 if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
314 printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
315 goto out_err;
316 }
317
318 /*
319 * The number of tables is computed by taking the
320	 * size of all entries (total size of the RSDT minus
321	 * the header size) divided by the size of each entry
322 * (4-byte table pointers).
323 */
324 tables = (header->length - sizeof(struct acpi_table_header)) / 4;
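	/*
	 * Worked example with hypothetical numbers: a 36-byte ACPI table
	 * header followed by five 4-byte entry pointers gives
	 * header->length == 36 + 5*4 == 56, so tables == (56 - 36) / 4 == 5.
	 */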
325
326 if (!tables)
327 goto out_err;
328
329 memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
330
331 if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
332 printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
333 saved_rsdt.table.header.length);
334 goto out_err;
335 }
336
337 printk("Begin SRAT table scan....\n");
338
339 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header)
344 break;
345 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header)
348 break;
349
350 if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
351 continue;
352
353 /* we've found the srat table. don't need to look at any more tables */
354 return acpi20_parse_srat((struct acpi_table_srat *)header);
355 }
356out_err:
357 remove_all_active_ranges();
358 printk("failed to get NUMA memory information from SRAT table\n");
359 return 0;
360}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644
index 000000000000..cb9109113584
--- /dev/null
+++ b/arch/x86/kernel/stacktrace.c
@@ -0,0 +1,54 @@
1/*
2 * arch/x86_64/kernel/stacktrace.c
3 *
4 * Stack trace management functions
5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/sched.h>
9#include <linux/stacktrace.h>
10#include <linux/module.h>
11#include <asm/stacktrace.h>
12
13static void save_stack_warning(void *data, char *msg)
14{
15}
16
17static void
18save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
19{
20}
21
22static int save_stack_stack(void *data, char *name)
23{
24 return -1;
25}
26
27static void save_stack_address(void *data, unsigned long addr)
28{
29 struct stack_trace *trace = (struct stack_trace *)data;
30 if (trace->skip > 0) {
31 trace->skip--;
32 return;
33 }
34 if (trace->nr_entries < trace->max_entries)
35 trace->entries[trace->nr_entries++] = addr;
36}
37
38static struct stacktrace_ops save_stack_ops = {
39 .warning = save_stack_warning,
40 .warning_symbol = save_stack_warning_symbol,
41 .stack = save_stack_stack,
42 .address = save_stack_address,
43};
44
45/*
46 * Save stack-backtrace addresses into a stack_trace buffer.
47 */
48void save_stack_trace(struct stack_trace *trace)
49{
50 dump_trace(current, NULL, NULL, &save_stack_ops, trace);
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = ULONG_MAX;
53}
54EXPORT_SYMBOL(save_stack_trace);
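
A hedged usage sketch (not part of this file): capture the current call chain into a caller-supplied buffer using the interface exported above. The 16-entry buffer size is arbitrary, and print_stack_trace() is assumed to be the generic helper from kernel/stacktrace.c.

static void example_capture(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.max_entries	= 16,
		.entries	= entries,
		.skip		= 1,	/* drop example_capture() itself */
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 0);	/* assumed helper; 0 = no extra indentation */
}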
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/x86/kernel/summit_32.c
@@ -0,0 +1,180 @@
1/*
2 * arch/i386/kernel/summit.c - IBM Summit-Specific Code
3 *
4 * Written By: Matthew Dobson, IBM Corporation
5 *
6 * Copyright (c) 2003 IBM Corp.
7 *
8 * All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or (at
13 * your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * Send feedback to <colpatch@us.ibm.com>
26 *
27 */
28
29#include <linux/mm.h>
30#include <linux/init.h>
31#include <asm/io.h>
32#include <asm/mach-summit/mach_mpparse.h>
33
34static struct rio_table_hdr *rio_table_hdr __initdata;
35static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
36static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
37
38static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
39{
40 int twister = 0, node = 0;
41 int i, bus, num_buses;
42
43 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
44 if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
45 twister = rio_devs[i]->owner_id;
46 break;
47 }
48 }
49 if (i == rio_table_hdr->num_rio_dev){
50 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
51 return last_bus;
52 }
53
54 for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
55 if (scal_devs[i]->node_id == twister){
56 node = scal_devs[i]->node_id;
57 break;
58 }
59 }
60 if (i == rio_table_hdr->num_scal_dev){
61 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
62 return last_bus;
63 }
64
65 switch (rio_devs[wpeg_num]->type){
66 case CompatWPEG:
67		/* The Compatibility Winnipeg controls the 2 legacy buses,
68 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
69 * a PCI-PCI bridge card is used in either slot: total 5 buses.
70 */
71 num_buses = 5;
72 break;
73 case AltWPEG:
74 /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
75 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
76 * the "extra" buses for each of those slots: total 7 buses.
77 */
78 num_buses = 7;
79 break;
80 case LookOutAWPEG:
81 case LookOutBWPEG:
82 /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
83 * & the "extra" buses for each of those slots: total 9 buses.
84 */
85 num_buses = 9;
86 break;
87 default:
88 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
89 return last_bus;
90 }
91
92 for(bus = last_bus; bus < last_bus + num_buses; bus++)
93 mp_bus_id_to_node[bus] = node;
94 return bus;
95}
96
97static int __init build_detail_arrays(void)
98{
99 unsigned long ptr;
100 int i, scal_detail_size, rio_detail_size;
101
102 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
103 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
104 return 0;
105 }
106
107 switch (rio_table_hdr->version){
108 default:
109 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
110 return 0;
111 case 2:
112 scal_detail_size = 11;
113 rio_detail_size = 13;
114 break;
115 case 3:
116 scal_detail_size = 12;
117 rio_detail_size = 15;
118 break;
119 }
120
121 ptr = (unsigned long)rio_table_hdr + 3;
122 for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
123 scal_devs[i] = (struct scal_detail *)ptr;
124
125 for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
126 rio_devs[i] = (struct rio_detail *)ptr;
127
128 return 1;
129}
130
131void __init setup_summit(void)
132{
133 unsigned long ptr;
134 unsigned short offset;
135 int i, next_wpeg, next_bus = 0;
136
137 /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
138 ptr = *(unsigned short *)phys_to_virt(0x40Eul);
139 ptr = (unsigned long)phys_to_virt(ptr << 4);
140
141 rio_table_hdr = NULL;
142 offset = 0x180;
143 while (offset){
144 /* The block id is stored in the 2nd word */
145 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
146 /* set the pointer past the offset & block id */
147 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
148 break;
149 }
150 /* The next offset is stored in the 1st word. 0 means no more */
151 offset = *((unsigned short *)(ptr + offset));
152 }
153 if (!rio_table_hdr){
154 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
155 return;
156 }
157
158 if (!build_detail_arrays())
159 return;
160
161 /* The first Winnipeg we're looking for has an index of 0 */
162 next_wpeg = 0;
163 do {
164 for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
165 if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
166 /* It's the Winnipeg we're looking for! */
167 next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
168 next_wpeg++;
169 break;
170 }
171 }
172 /*
173 * If we go through all Rio devices and don't find one with
174 * the next index, it means we've found all the Winnipegs,
175 * and thus all the PCI buses.
176 */
177 if (i == rio_table_hdr->num_rio_dev)
178 next_wpeg = 0;
179 } while (next_wpeg != 0);
180}
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
new file mode 100644
index 000000000000..573c0a6e0ac6
--- /dev/null
+++ b/arch/x86/kernel/suspend_64.c
@@ -0,0 +1,239 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/smp.h>
11#include <linux/suspend.h>
12#include <asm/proto.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15#include <asm/mtrr.h>
16
17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end;
19
20struct saved_context saved_context;
21
22unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
23unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
24unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
25unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
26unsigned long saved_context_eflags;
27
28void __save_processor_state(struct saved_context *ctxt)
29{
30 kernel_fpu_begin();
31
32 /*
33 * descriptor tables
34 */
35 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
36 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
37 asm volatile ("str %0" : "=m" (ctxt->tr));
38
39 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
40 /*
41 * segment registers
42 */
43 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
44 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
45 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
46 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
47 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
48
49 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
50 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
51 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
52 mtrr_save_fixed_ranges(NULL);
53
54 /*
55 * control registers
56 */
57 rdmsrl(MSR_EFER, ctxt->efer);
58 ctxt->cr0 = read_cr0();
59 ctxt->cr2 = read_cr2();
60 ctxt->cr3 = read_cr3();
61 ctxt->cr4 = read_cr4();
62 ctxt->cr8 = read_cr8();
63}
64
65void save_processor_state(void)
66{
67 __save_processor_state(&saved_context);
68}
69
70static void do_fpu_end(void)
71{
72 /*
73 * Restore FPU regs if necessary
74 */
75 kernel_fpu_end();
76}
77
78void __restore_processor_state(struct saved_context *ctxt)
79{
80 /*
81 * control registers
82 */
83 wrmsrl(MSR_EFER, ctxt->efer);
84 write_cr8(ctxt->cr8);
85 write_cr4(ctxt->cr4);
86 write_cr3(ctxt->cr3);
87 write_cr2(ctxt->cr2);
88 write_cr0(ctxt->cr0);
89
90 /*
91 * now restore the descriptor tables to their proper values
92	 * ltr is done in fix_processor_context().
93 */
94 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
95 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
96
97 /*
98 * segment registers
99 */
100 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
101 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
102 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
103 load_gs_index(ctxt->gs);
104 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
105
106 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
107 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
108 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
109
110 fix_processor_context();
111
112 do_fpu_end();
113 mtrr_ap_init();
114}
115
116void restore_processor_state(void)
117{
118 __restore_processor_state(&saved_context);
119}
120
121void fix_processor_context(void)
122{
123 int cpu = smp_processor_id();
124 struct tss_struct *t = &per_cpu(init_tss, cpu);
125
126	set_tss_desc(cpu, t);	/* This just modifies memory; it should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
127
128 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
129
130 syscall_init(); /* This sets MSR_*STAR and related */
131 load_TR_desc(); /* This does ltr */
132 load_LDT(&current->active_mm->context); /* This does lldt */
133
134 /*
135 * Now maybe reload the debug registers
136 */
137 if (current->thread.debugreg7){
138 loaddebug(&current->thread, 0);
139 loaddebug(&current->thread, 1);
140 loaddebug(&current->thread, 2);
141 loaddebug(&current->thread, 3);
142 /* no 4 and 5 */
143 loaddebug(&current->thread, 6);
144 loaddebug(&current->thread, 7);
145 }
146
147}
148
149#ifdef CONFIG_HIBERNATION
150/* Defined in arch/x86_64/kernel/suspend_asm.S */
151extern int restore_image(void);
152
153pgd_t *temp_level4_pgt;
154
155static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
156{
157 long i, j;
158
159 i = pud_index(address);
160 pud = pud + i;
161 for (; i < PTRS_PER_PUD; pud++, i++) {
162 unsigned long paddr;
163 pmd_t *pmd;
164
165 paddr = address + i*PUD_SIZE;
166 if (paddr >= end)
167 break;
168
169 pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
170 if (!pmd)
171 return -ENOMEM;
172 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
173 for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
174 unsigned long pe;
175
176 if (paddr >= end)
177 break;
178 pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
179 pe &= __supported_pte_mask;
180 set_pmd(pmd, __pmd(pe));
181 }
182 }
183 return 0;
184}
185
186static int set_up_temporary_mappings(void)
187{
188 unsigned long start, end, next;
189 int error;
190
191 temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
192 if (!temp_level4_pgt)
193 return -ENOMEM;
194
195 /* It is safe to reuse the original kernel mapping */
196 set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
197 init_level4_pgt[pgd_index(__START_KERNEL_map)]);
198
199 /* Set up the direct mapping from scratch */
200 start = (unsigned long)pfn_to_kaddr(0);
201 end = (unsigned long)pfn_to_kaddr(end_pfn);
202
203 for (; start < end; start = next) {
204 pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
205 if (!pud)
206 return -ENOMEM;
207 next = start + PGDIR_SIZE;
208 if (next > end)
209 next = end;
210 if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
211 return error;
212 set_pgd(temp_level4_pgt + pgd_index(start),
213 mk_kernel_pgd(__pa(pud)));
214 }
215 return 0;
216}
217
218int swsusp_arch_resume(void)
219{
220 int error;
221
222 /* We have got enough memory and from now on we cannot recover */
223 if ((error = set_up_temporary_mappings()))
224 return error;
225 restore_image();
226 return 0;
227}
228
229/*
230 * pfn_is_nosave - check if given pfn is in the 'nosave' section
231 */
232
233int pfn_is_nosave(unsigned long pfn)
234{
235 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
236 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
237 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
238}
239#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
new file mode 100644
index 000000000000..16d183f67bc1
--- /dev/null
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -0,0 +1,110 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack or any variable that is
6 * not "NoSave" while copying pages:
7 *
8 * It's rewriting one kernel image with another. What is the stack in
9 * the "old" image could very well be a data page in the "new" image,
10 * and overwriting your own stack under you is a bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/asm-offsets.h>
18
19ENTRY(swsusp_arch_suspend)
20
21 movq %rsp, saved_context_esp(%rip)
22 movq %rax, saved_context_eax(%rip)
23 movq %rbx, saved_context_ebx(%rip)
24 movq %rcx, saved_context_ecx(%rip)
25 movq %rdx, saved_context_edx(%rip)
26 movq %rbp, saved_context_ebp(%rip)
27 movq %rsi, saved_context_esi(%rip)
28 movq %rdi, saved_context_edi(%rip)
29 movq %r8, saved_context_r08(%rip)
30 movq %r9, saved_context_r09(%rip)
31 movq %r10, saved_context_r10(%rip)
32 movq %r11, saved_context_r11(%rip)
33 movq %r12, saved_context_r12(%rip)
34 movq %r13, saved_context_r13(%rip)
35 movq %r14, saved_context_r14(%rip)
36 movq %r15, saved_context_r15(%rip)
37 pushfq ; popq saved_context_eflags(%rip)
38
39 call swsusp_save
40 ret
41
42ENTRY(restore_image)
43 /* switch to temporary page tables */
44 movq $__PAGE_OFFSET, %rdx
45 movq temp_level4_pgt(%rip), %rax
46 subq %rdx, %rax
47 movq %rax, %cr3
48 /* Flush TLB */
49 movq mmu_cr4_features(%rip), %rax
50 movq %rax, %rdx
51 andq $~(1<<7), %rdx # PGE
52 movq %rdx, %cr4; # turn off PGE
53 movq %cr3, %rcx; # flush TLB
54 movq %rcx, %cr3;
55 movq %rax, %cr4; # turn PGE back on
56
57 movq restore_pblist(%rip), %rdx
58loop:
59 testq %rdx, %rdx
60 jz done
61
62 /* get addresses from the pbe and copy the page */
63 movq pbe_address(%rdx), %rsi
64 movq pbe_orig_address(%rdx), %rdi
65 movq $512, %rcx
66 rep
67 movsq
68
69 /* progress to the next pbe */
70 movq pbe_next(%rdx), %rdx
71 jmp loop
72done:
73 /* go back to the original page tables */
74 movq $(init_level4_pgt - __START_KERNEL_map), %rax
75 addq phys_base(%rip), %rax
76 movq %rax, %cr3
77
78 /* Flush TLB, including "global" things (vmalloc) */
79 movq mmu_cr4_features(%rip), %rax
80 movq %rax, %rdx
81 andq $~(1<<7), %rdx; # PGE
82 movq %rdx, %cr4; # turn off PGE
83 movq %cr3, %rcx; # flush TLB
84 movq %rcx, %cr3
85 movq %rax, %cr4; # turn PGE back on
86
87 movl $24, %eax
88 movl %eax, %ds
89
90 movq saved_context_esp(%rip), %rsp
91 movq saved_context_ebp(%rip), %rbp
92 /* Don't restore %rax, it must be 0 anyway */
93 movq saved_context_ebx(%rip), %rbx
94 movq saved_context_ecx(%rip), %rcx
95 movq saved_context_edx(%rip), %rdx
96 movq saved_context_esi(%rip), %rsi
97 movq saved_context_edi(%rip), %rdi
98 movq saved_context_r08(%rip), %r8
99 movq saved_context_r09(%rip), %r9
100 movq saved_context_r10(%rip), %r10
101 movq saved_context_r11(%rip), %r11
102 movq saved_context_r12(%rip), %r12
103 movq saved_context_r13(%rip), %r13
104 movq saved_context_r14(%rip), %r14
105 movq saved_context_r15(%rip), %r15
106 pushq saved_context_eflags(%rip) ; popfq
107
108 xorq %rax, %rax
109
110 ret
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
new file mode 100644
index 000000000000..42147304de88
--- /dev/null
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -0,0 +1,265 @@
1/*
2 * linux/arch/i386/kernel/sys_i386.c
3 *
4 * This file contains various random system calls that
5 * have a non-standard calling sequence on the Linux/i386
6 * platform.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/fs.h>
13#include <linux/smp.h>
14#include <linux/sem.h>
15#include <linux/msg.h>
16#include <linux/shm.h>
17#include <linux/stat.h>
18#include <linux/syscalls.h>
19#include <linux/mman.h>
20#include <linux/file.h>
21#include <linux/utsname.h>
22
23#include <asm/uaccess.h>
24#include <asm/unistd.h>
25#include <asm/ipc.h>
26
27/*
28 * sys_pipe() is the normal C calling standard for creating
29 * a pipe. It's not the way Unix traditionally does this, though.
30 */
31asmlinkage int sys_pipe(unsigned long __user * fildes)
32{
33 int fd[2];
34 int error;
35
36 error = do_pipe(fd);
37 if (!error) {
38 if (copy_to_user(fildes, fd, 2*sizeof(int)))
39 error = -EFAULT;
40 }
41 return error;
42}
43
44asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
45 unsigned long prot, unsigned long flags,
46 unsigned long fd, unsigned long pgoff)
47{
48 int error = -EBADF;
49 struct file *file = NULL;
50 struct mm_struct *mm = current->mm;
51
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58
59 down_write(&mm->mmap_sem);
60 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
61 up_write(&mm->mmap_sem);
62
63 if (file)
64 fput(file);
65out:
66 return error;
67}
68
69/*
70 * Perform the select(nd, in, out, ex, tv) and mmap() system
71 * calls. Linux/i386 originally could not handle more than
72 * 4 system call parameters, so these system calls used a memory
73 * block for parameter passing.
74 */
75
76struct mmap_arg_struct {
77 unsigned long addr;
78 unsigned long len;
79 unsigned long prot;
80 unsigned long flags;
81 unsigned long fd;
82 unsigned long offset;
83};
84
85asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
86{
87 struct mmap_arg_struct a;
88 int err = -EFAULT;
89
90 if (copy_from_user(&a, arg, sizeof(a)))
91 goto out;
92
93 err = -EINVAL;
94 if (a.offset & ~PAGE_MASK)
95 goto out;
96
97 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
98 a.fd, a.offset >> PAGE_SHIFT);
99out:
100 return err;
101}
102
103
104struct sel_arg_struct {
105 unsigned long n;
106 fd_set __user *inp, *outp, *exp;
107 struct timeval __user *tvp;
108};
109
110asmlinkage int old_select(struct sel_arg_struct __user *arg)
111{
112 struct sel_arg_struct a;
113
114 if (copy_from_user(&a, arg, sizeof(a)))
115 return -EFAULT;
116 /* sys_select() does the appropriate kernel locking */
117 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
118}
119
120/*
121 * sys_ipc() is the de-multiplexer for the SysV IPC calls.
122 *
123 * This is really horribly ugly.
124 */
125asmlinkage int sys_ipc (uint call, int first, int second,
126 int third, void __user *ptr, long fifth)
127{
128 int version, ret;
129
130 version = call >> 16; /* hack for backward compatibility */
131 call &= 0xffff;
132
133 switch (call) {
134 case SEMOP:
135 return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
136 case SEMTIMEDOP:
137 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
138 (const struct timespec __user *)fifth);
139
140 case SEMGET:
141 return sys_semget (first, second, third);
142 case SEMCTL: {
143 union semun fourth;
144 if (!ptr)
145 return -EINVAL;
146 if (get_user(fourth.__pad, (void __user * __user *) ptr))
147 return -EFAULT;
148 return sys_semctl (first, second, third, fourth);
149 }
150
151 case MSGSND:
152 return sys_msgsnd (first, (struct msgbuf __user *) ptr,
153 second, third);
154 case MSGRCV:
155 switch (version) {
156 case 0: {
157 struct ipc_kludge tmp;
158 if (!ptr)
159 return -EINVAL;
160
161 if (copy_from_user(&tmp,
162 (struct ipc_kludge __user *) ptr,
163 sizeof (tmp)))
164 return -EFAULT;
165 return sys_msgrcv (first, tmp.msgp, second,
166 tmp.msgtyp, third);
167 }
168 default:
169 return sys_msgrcv (first,
170 (struct msgbuf __user *) ptr,
171 second, fifth, third);
172 }
173 case MSGGET:
174 return sys_msgget ((key_t) first, second);
175 case MSGCTL:
176 return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
177
178 case SHMAT:
179 switch (version) {
180 default: {
181 ulong raddr;
182 ret = do_shmat (first, (char __user *) ptr, second, &raddr);
183 if (ret)
184 return ret;
185 return put_user (raddr, (ulong __user *) third);
186 }
187 case 1: /* iBCS2 emulator entry point */
188 if (!segment_eq(get_fs(), get_ds()))
189 return -EINVAL;
190 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
191 return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
192 }
193 case SHMDT:
194 return sys_shmdt ((char __user *)ptr);
195 case SHMGET:
196 return sys_shmget (first, second, third);
197 case SHMCTL:
198 return sys_shmctl (first, second,
199 (struct shmid_ds __user *) ptr);
200 default:
201 return -ENOSYS;
202 }
203}
204
205/*
206 * Old cruft
207 */
208asmlinkage int sys_uname(struct old_utsname __user * name)
209{
210 int err;
211 if (!name)
212 return -EFAULT;
213 down_read(&uts_sem);
214 err = copy_to_user(name, utsname(), sizeof (*name));
215 up_read(&uts_sem);
216 return err?-EFAULT:0;
217}
218
219asmlinkage int sys_olduname(struct oldold_utsname __user * name)
220{
221 int error;
222
223 if (!name)
224 return -EFAULT;
225 if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
226 return -EFAULT;
227
228 down_read(&uts_sem);
229
230 error = __copy_to_user(&name->sysname, &utsname()->sysname,
231 __OLD_UTS_LEN);
232 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
233 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
234 __OLD_UTS_LEN);
235 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
236 error |= __copy_to_user(&name->release, &utsname()->release,
237 __OLD_UTS_LEN);
238 error |= __put_user(0, name->release + __OLD_UTS_LEN);
239 error |= __copy_to_user(&name->version, &utsname()->version,
240 __OLD_UTS_LEN);
241 error |= __put_user(0, name->version + __OLD_UTS_LEN);
242 error |= __copy_to_user(&name->machine, &utsname()->machine,
243 __OLD_UTS_LEN);
244 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
245
246 up_read(&uts_sem);
247
248 error = error ? -EFAULT : 0;
249
250 return error;
251}
252
253
254/*
255 * Do a system call from kernel instead of calling sys_execve so we
256 * end up with proper pt_regs.
257 */
258int kernel_execve(const char *filename, char *const argv[], char *const envp[])
259{
260 long __res;
261 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
262 : "=a" (__res)
263 : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
264 return __res;
265}
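
For context, a minimal user-space sketch (not part of the diff) of how the multiplexed ipc(2) entry point above is driven: SEMOP is call number 1 and the argument order follows the switch in sys_ipc(). This assumes a 32-bit i386 build where SYS_ipc is defined; error handling is mostly omitted.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* One private semaphore, then a single "up" operation on it. */
	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	struct sembuf op = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };

	if (id < 0) {
		perror("semget");
		return 1;
	}
	/* Equivalent to semop(id, &op, 1): call=SEMOP(1), first=id,
	 * second=nsops, third unused, ptr=ops. */
	long ret = syscall(SYS_ipc, 1, id, 1, 0, &op, 0);
	printf("ipc(SEMOP) returned %ld\n", ret);
	return 0;
}
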
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..4770b7a2052c
--- /dev/null
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -0,0 +1,159 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/smp.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
 72		/* This is usually needed to map code in the small
 73		   code model, so it has to live in the first 31 bits.
 74		   Limit it to that. This means we need to move the
 75		   unmapped base down for this case. This can conflict
 76		   with the heap, but we assume that glibc malloc
 77		   knows how to fall back to mmap. Give it 1GB
 78		   of playground for now. -AK */
79 *begin = 0x40000000;
80 *end = 0x80000000;
81 } else {
82 *begin = TASK_UNMAPPED_BASE;
83 *end = TASK_SIZE;
84 }
85}
86
87unsigned long
88arch_get_unmapped_area(struct file *filp, unsigned long addr,
89 unsigned long len, unsigned long pgoff, unsigned long flags)
90{
91 struct mm_struct *mm = current->mm;
92 struct vm_area_struct *vma;
93 unsigned long start_addr;
94 unsigned long begin, end;
95
96 if (flags & MAP_FIXED)
97 return addr;
98
99 find_start_end(flags, &begin, &end);
100
101 if (len > end)
102 return -ENOMEM;
103
104 if (addr) {
105 addr = PAGE_ALIGN(addr);
106 vma = find_vma(mm, addr);
107 if (end - len >= addr &&
108 (!vma || addr + len <= vma->vm_start))
109 return addr;
110 }
111 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
112 && len <= mm->cached_hole_size) {
113 mm->cached_hole_size = 0;
114 mm->free_area_cache = begin;
115 }
116 addr = mm->free_area_cache;
117 if (addr < begin)
118 addr = begin;
119 start_addr = addr;
120
121full_search:
122 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
123 /* At this point: (!vma || addr < vma->vm_end). */
124 if (end - len < addr) {
125 /*
126 * Start a new search - just in case we missed
127 * some holes.
128 */
129 if (start_addr != begin) {
130 start_addr = addr = begin;
131 mm->cached_hole_size = 0;
132 goto full_search;
133 }
134 return -ENOMEM;
135 }
136 if (!vma || addr + len <= vma->vm_start) {
137 /*
138 * Remember the place where we stopped the search:
139 */
140 mm->free_area_cache = addr + len;
141 return addr;
142 }
143 if (addr + mm->cached_hole_size < vma->vm_start)
144 mm->cached_hole_size = vma->vm_start - addr;
145
146 addr = vma->vm_end;
147 }
148}
149
150asmlinkage long sys_uname(struct new_utsname __user * name)
151{
152 int err;
153 down_read(&uts_sem);
154 err = copy_to_user(name, utsname(), sizeof (*name));
155 up_read(&uts_sem);
156 if (personality(current->personality) == PER_LINUX32)
157 err |= copy_to_user(&name->machine, "i686", 5);
158 return err ? -EFAULT : 0;
159}
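
A small sketch of what find_start_end() above means for user space: with MAP_32BIT (outside IA32 mode), addresses come from the 1GB window starting at 0x40000000. Illustration only, for 64-bit x86; the exact address depends on what is already mapped.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Ask for a low mapping; arch_get_unmapped_area() should pick an
	 * address in the 0x40000000-0x80000000 range set up above. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("MAP_32BIT mapping at %p\n", p);
	munmap(p, 4096);
	return 0;
}
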
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
new file mode 100644
index 000000000000..9d498c2f8eea
--- /dev/null
+++ b/arch/x86/kernel/syscall_64.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm/unistd_64.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 23	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm/unistd_64.h>
26};
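
The table above leans on a GNU C extension: a range designator pre-fills every slot with sys_ni_syscall before the per-call entries generated from unistd_64.h override individual slots. A standalone sketch of the same idiom (hypothetical handler names, GCC/Clang only):

#include <stdio.h>

typedef void (*handler_t)(void);

static void default_handler(void) { }
static void real_handler(void) { }

/* Fill everything first, then override slot 2 -- the same pattern as
 * sys_call_table, just with made-up handlers. */
static const handler_t table[16] = {
	[0 ... 15] = default_handler,
	[2] = real_handler,
};

int main(void)
{
	printf("slot 2 overridden: %d\n", table[2] == real_handler);
	printf("slot 5 still default: %d\n", table[5] == default_handler);
	return 0;
}
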
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
new file mode 100644
index 000000000000..8344c70adf61
--- /dev/null
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -0,0 +1,326 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long sys_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long sys_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long old_readdir
92 .long old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long sys_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long sys_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long sys_sigreturn
122 .long sys_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long sys_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_nfsservctl
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long sys_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long sys_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long sys_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap2
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd
325 .long sys_eventfd
326 .long sys_fallocate
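
To make the indexing concrete: entry 20 in the table above is sys_getpid, and an int $0x80 with that number in %eax lands there. A rough sketch, only meaningful when compiled as 32-bit i386 code (gcc -m32):

#include <stdio.h>

static long int80_syscall0(long nr)
{
	long ret;

	/* The number in %eax indexes sys_call_table; the return value
	 * comes back in %eax. */
	asm volatile ("int $0x80" : "=a" (ret) : "0" (nr) : "memory");
	return ret;
}

int main(void)
{
	printf("getpid via int $0x80: %ld\n", int80_syscall0(20));
	return 0;
}
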
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
new file mode 100644
index 000000000000..4eb2e408764f
--- /dev/null
+++ b/arch/x86/kernel/sysenter_32.c
@@ -0,0 +1,348 @@
1/*
2 * linux/arch/i386/kernel/sysenter.c
3 *
4 * (C) Copyright 2002 Linus Torvalds
5 * Portions based on the vdso-randomization code from exec-shield:
6 * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This file contains the needed initializations to support sysenter.
9 */
10
11#include <linux/init.h>
12#include <linux/smp.h>
13#include <linux/thread_info.h>
14#include <linux/sched.h>
15#include <linux/gfp.h>
16#include <linux/string.h>
17#include <linux/elf.h>
18#include <linux/mm.h>
19#include <linux/err.h>
20#include <linux/module.h>
21
22#include <asm/cpufeature.h>
23#include <asm/msr.h>
24#include <asm/pgtable.h>
25#include <asm/unistd.h>
26#include <asm/elf.h>
27#include <asm/tlbflush.h>
28
29enum {
30 VDSO_DISABLED = 0,
31 VDSO_ENABLED = 1,
32 VDSO_COMPAT = 2,
33};
34
35#ifdef CONFIG_COMPAT_VDSO
36#define VDSO_DEFAULT VDSO_COMPAT
37#else
38#define VDSO_DEFAULT VDSO_ENABLED
39#endif
40
41/*
42 * Should the kernel map a VDSO page into processes and pass its
43 * address down to glibc upon exec()?
44 */
45unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
46
47EXPORT_SYMBOL_GPL(vdso_enabled);
48
49static int __init vdso_setup(char *s)
50{
51 vdso_enabled = simple_strtoul(s, NULL, 0);
52
53 return 1;
54}
55
56__setup("vdso=", vdso_setup);
57
58extern asmlinkage void sysenter_entry(void);
59
60static __init void reloc_symtab(Elf32_Ehdr *ehdr,
61 unsigned offset, unsigned size)
62{
63 Elf32_Sym *sym = (void *)ehdr + offset;
64 unsigned nsym = size / sizeof(*sym);
65 unsigned i;
66
67 for(i = 0; i < nsym; i++, sym++) {
68 if (sym->st_shndx == SHN_UNDEF ||
69 sym->st_shndx == SHN_ABS)
70 continue; /* skip */
71
72 if (sym->st_shndx > SHN_LORESERVE) {
73 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
74 sym->st_shndx);
75 continue;
76 }
77
78 switch(ELF_ST_TYPE(sym->st_info)) {
79 case STT_OBJECT:
80 case STT_FUNC:
81 case STT_SECTION:
82 case STT_FILE:
83 sym->st_value += VDSO_HIGH_BASE;
84 }
85 }
86}
87
88static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
89{
90 Elf32_Dyn *dyn = (void *)ehdr + offset;
91
92 for(; dyn->d_tag != DT_NULL; dyn++)
93 switch(dyn->d_tag) {
94 case DT_PLTGOT:
95 case DT_HASH:
96 case DT_STRTAB:
97 case DT_SYMTAB:
98 case DT_RELA:
99 case DT_INIT:
100 case DT_FINI:
101 case DT_REL:
102 case DT_DEBUG:
103 case DT_JMPREL:
104 case DT_VERSYM:
105 case DT_VERDEF:
106 case DT_VERNEED:
107 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
108 /* definitely pointers needing relocation */
109 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
110 break;
111
112 case DT_ENCODING ... OLD_DT_LOOS-1:
113 case DT_LOOS ... DT_HIOS-1:
114 /* Tags above DT_ENCODING are pointers if
115 they're even */
116 if (dyn->d_tag >= DT_ENCODING &&
117 (dyn->d_tag & 1) == 0)
118 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
119 break;
120
121 case DT_VERDEFNUM:
122 case DT_VERNEEDNUM:
123 case DT_FLAGS_1:
124 case DT_RELACOUNT:
125 case DT_RELCOUNT:
126 case DT_VALRNGLO ... DT_VALRNGHI:
127 /* definitely not pointers */
128 break;
129
130 case OLD_DT_LOOS ... DT_LOOS-1:
131 case DT_HIOS ... DT_VALRNGLO-1:
132 default:
133 if (dyn->d_tag > DT_ENCODING)
134 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
135 dyn->d_tag);
136 break;
137 }
138}
139
140static __init void relocate_vdso(Elf32_Ehdr *ehdr)
141{
142 Elf32_Phdr *phdr;
143 Elf32_Shdr *shdr;
144 int i;
145
146 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
147 !elf_check_arch(ehdr) ||
148 ehdr->e_type != ET_DYN);
149
150 ehdr->e_entry += VDSO_HIGH_BASE;
151
152 /* rebase phdrs */
153 phdr = (void *)ehdr + ehdr->e_phoff;
154 for (i = 0; i < ehdr->e_phnum; i++) {
155 phdr[i].p_vaddr += VDSO_HIGH_BASE;
156
157 /* relocate dynamic stuff */
158 if (phdr[i].p_type == PT_DYNAMIC)
159 reloc_dyn(ehdr, phdr[i].p_offset);
160 }
161
162 /* rebase sections */
163 shdr = (void *)ehdr + ehdr->e_shoff;
164 for(i = 0; i < ehdr->e_shnum; i++) {
165 if (!(shdr[i].sh_flags & SHF_ALLOC))
166 continue;
167
168 shdr[i].sh_addr += VDSO_HIGH_BASE;
169
170 if (shdr[i].sh_type == SHT_SYMTAB ||
171 shdr[i].sh_type == SHT_DYNSYM)
172 reloc_symtab(ehdr, shdr[i].sh_offset,
173 shdr[i].sh_size);
174 }
175}
176
177void enable_sep_cpu(void)
178{
179 int cpu = get_cpu();
180 struct tss_struct *tss = &per_cpu(init_tss, cpu);
181
182 if (!boot_cpu_has(X86_FEATURE_SEP)) {
183 put_cpu();
184 return;
185 }
186
187 tss->x86_tss.ss1 = __KERNEL_CS;
188 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
189 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
190 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
191 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
192 put_cpu();
193}
194
195static struct vm_area_struct gate_vma;
196
197static int __init gate_vma_init(void)
198{
199 gate_vma.vm_mm = NULL;
200 gate_vma.vm_start = FIXADDR_USER_START;
201 gate_vma.vm_end = FIXADDR_USER_END;
202 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
203 gate_vma.vm_page_prot = __P101;
204 /*
205 * Make sure the vDSO gets into every core dump.
206 * Dumping its contents makes post-mortem fully interpretable later
207 * without matching up the same kernel and hardware config to see
208 * what PC values meant.
209 */
210 gate_vma.vm_flags |= VM_ALWAYSDUMP;
211 return 0;
212}
213
214/*
215 * These symbols are defined by vsyscall.o to mark the bounds
216 * of the ELF DSO images included therein.
217 */
218extern const char vsyscall_int80_start, vsyscall_int80_end;
219extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
220static struct page *syscall_pages[1];
221
222static void map_compat_vdso(int map)
223{
224 static int vdso_mapped;
225
226 if (map == vdso_mapped)
227 return;
228
229 vdso_mapped = map;
230
231 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
232 map ? PAGE_READONLY_EXEC : PAGE_NONE);
233
234 /* flush stray tlbs */
235 flush_tlb_all();
236}
237
238int __init sysenter_setup(void)
239{
240 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
241 const void *vsyscall;
242 size_t vsyscall_len;
243
244 syscall_pages[0] = virt_to_page(syscall_page);
245
246 gate_vma_init();
247
248 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
249
250 if (!boot_cpu_has(X86_FEATURE_SEP)) {
251 vsyscall = &vsyscall_int80_start;
252 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
253 } else {
254 vsyscall = &vsyscall_sysenter_start;
255 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
256 }
257
258 memcpy(syscall_page, vsyscall, vsyscall_len);
259 relocate_vdso(syscall_page);
260
261 return 0;
262}
263
264/* Defined in vsyscall-sysenter.S */
265extern void SYSENTER_RETURN;
266
267/* Setup a VMA at program startup for the vsyscall page */
268int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
269{
270 struct mm_struct *mm = current->mm;
271 unsigned long addr;
272 int ret = 0;
273 bool compat;
274
275 down_write(&mm->mmap_sem);
276
277 /* Test compat mode once here, in case someone
278 changes it via sysctl */
279 compat = (vdso_enabled == VDSO_COMPAT);
280
281 map_compat_vdso(compat);
282
283 if (compat)
284 addr = VDSO_HIGH_BASE;
285 else {
286 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
287 if (IS_ERR_VALUE(addr)) {
288 ret = addr;
289 goto up_fail;
290 }
291
292 /*
293 * MAYWRITE to allow gdb to COW and set breakpoints
294 *
295 * Make sure the vDSO gets into every core dump.
296 * Dumping its contents makes post-mortem fully
297 * interpretable later without matching up the same
298 * kernel and hardware config to see what PC values
299 * meant.
300 */
301 ret = install_special_mapping(mm, addr, PAGE_SIZE,
302 VM_READ|VM_EXEC|
303 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
304 VM_ALWAYSDUMP,
305 syscall_pages);
306
307 if (ret)
308 goto up_fail;
309 }
310
311 current->mm->context.vdso = (void *)addr;
312 current_thread_info()->sysenter_return =
313 (void *)VDSO_SYM(&SYSENTER_RETURN);
314
315 up_fail:
316 up_write(&mm->mmap_sem);
317
318 return ret;
319}
320
321const char *arch_vma_name(struct vm_area_struct *vma)
322{
323 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
324 return "[vdso]";
325 return NULL;
326}
327
328struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
329{
330 struct mm_struct *mm = tsk->mm;
331
332 /* Check to see if this task was created in compat vdso mode */
333 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
334 return &gate_vma;
335 return NULL;
336}
337
338int in_gate_area(struct task_struct *task, unsigned long addr)
339{
340 const struct vm_area_struct *vma = get_gate_vma(task);
341
342 return vma && addr >= vma->vm_start && addr < vma->vm_end;
343}
344
345int in_gate_area_no_task(unsigned long addr)
346{
347 return 0;
348}
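
The address chosen in arch_setup_additional_pages() is what user space later sees in its ELF auxiliary vector. A rough sketch using the (much later) glibc helper getauxval(); on i386, AT_SYSINFO carries the syscall entry point and AT_SYSINFO_EHDR the vDSO image base.

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Both values come straight from the kernel's auxv entries;
	 * either may be 0 if the kernel did not provide it. */
	printf("AT_SYSINFO_EHDR (vDSO base): %#lx\n",
	       (unsigned long)getauxval(AT_SYSINFO_EHDR));
#ifdef AT_SYSINFO
	printf("AT_SYSINFO (syscall entry):  %#lx\n",
	       (unsigned long)getauxval(AT_SYSINFO));
#endif
	return 0;
}
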
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644
index 000000000000..e3f2569b2c44
--- /dev/null
+++ b/arch/x86/kernel/tce_64.c
@@ -0,0 +1,189 @@
1/*
2 * This file manages the translation entries for the IBM Calgary IOMMU.
3 *
4 * Derived from arch/powerpc/platforms/pseries/iommu.c
5 *
6 * Copyright (C) IBM Corporation, 2006
7 *
8 * Author: Jon Mason <jdmason@us.ibm.com>
9 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/mm.h>
29#include <linux/spinlock.h>
30#include <linux/string.h>
31#include <linux/pci.h>
32#include <linux/dma-mapping.h>
33#include <linux/bootmem.h>
34#include <asm/tce.h>
35#include <asm/calgary.h>
36#include <asm/proto.h>
37
38/* flush a tce at 'tceaddr' to main memory */
39static inline void flush_tce(void* tceaddr)
40{
41 /* a single tce can't cross a cache line */
42 if (cpu_has_clflush)
43 asm volatile("clflush (%0)" :: "r" (tceaddr));
44 else
45 asm volatile("wbinvd":::"memory");
46}
47
48void tce_build(struct iommu_table *tbl, unsigned long index,
49 unsigned int npages, unsigned long uaddr, int direction)
50{
51 u64* tp;
52 u64 t;
53 u64 rpn;
54
55 t = (1 << TCE_READ_SHIFT);
56 if (direction != DMA_TO_DEVICE)
57 t |= (1 << TCE_WRITE_SHIFT);
58
59 tp = ((u64*)tbl->it_base) + index;
60
61 while (npages--) {
62 rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
63 t &= ~TCE_RPN_MASK;
64 t |= (rpn << TCE_RPN_SHIFT);
65
66 *tp = cpu_to_be64(t);
67 flush_tce(tp);
68
69 uaddr += PAGE_SIZE;
70 tp++;
71 }
72}
73
74void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
75{
76 u64* tp;
77
78 tp = ((u64*)tbl->it_base) + index;
79
80 while (npages--) {
81 *tp = cpu_to_be64(0);
82 flush_tce(tp);
83 tp++;
84 }
85}
86
87static inline unsigned int table_size_to_number_of_entries(unsigned char size)
88{
89 /*
90 * size is the order of the table, 0-7
91 * smallest table is 8K entries, so shift result by 13 to
92 * multiply by 8K
93 */
94 return (1 << size) << 13;
95}
96
97static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
98{
99 unsigned int bitmapsz;
100 unsigned long bmppages;
101 int ret;
102
103 tbl->it_busno = dev->bus->number;
104
105 /* set the tce table size - measured in entries */
106 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
107
108 /*
 109	 * number of bytes needed for the allocation bitmap:
 110	 * one bit per table entry
111 */
112 bitmapsz = tbl->it_size / BITS_PER_BYTE;
113 bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
114 if (!bmppages) {
115 printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
116 ret = -ENOMEM;
117 goto done;
118 }
119
120 tbl->it_map = (unsigned long*)bmppages;
121
122 memset(tbl->it_map, 0, bitmapsz);
123
124 tbl->it_hint = 0;
125
126 spin_lock_init(&tbl->it_lock);
127
128 return 0;
129
130done:
131 return ret;
132}
133
134int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
135{
136 struct iommu_table *tbl;
137 int ret;
138
139 if (pci_iommu(dev->bus)) {
140 printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
141 dev, pci_iommu(dev->bus));
142 BUG();
143 }
144
145 tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
146 if (!tbl) {
147 printk(KERN_ERR "Calgary: error allocating iommu_table\n");
148 ret = -ENOMEM;
149 goto done;
150 }
151
152 ret = tce_table_setparms(dev, tbl);
153 if (ret)
154 goto free_tbl;
155
156 tbl->bbar = bbar;
157
158 set_pci_iommu(dev->bus, tbl);
159
160 return 0;
161
162free_tbl:
163 kfree(tbl);
164done:
165 return ret;
166}
167
168void * __init alloc_tce_table(void)
169{
170 unsigned int size;
171
172 size = table_size_to_number_of_entries(specified_table_size);
173 size *= TCE_ENTRY_SIZE;
174
175 return __alloc_bootmem_low(size, size, 0);
176}
177
178void __init free_tce_table(void *tbl)
179{
180 unsigned int size;
181
182 if (!tbl)
183 return;
184
185 size = table_size_to_number_of_entries(specified_table_size);
186 size *= TCE_ENTRY_SIZE;
187
188 free_bootmem(__pa(tbl), size);
189}
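
A quick standalone check of the sizing arithmetic in table_size_to_number_of_entries() and tce_table_setparms(): the order runs 0-7, the smallest table is 8K entries, and the allocation bitmap needs one bit per entry. The TCE_ENTRY_SIZE of 8 bytes (one u64 per entry) is an assumption made for the printout.

#include <stdio.h>

#define TCE_ENTRY_SIZE	8	/* assumed: one 64-bit TCE per entry */
#define BITS_PER_BYTE	8

static unsigned int table_size_to_number_of_entries(unsigned char size)
{
	return (1u << size) << 13;	/* 8K entries times 2^order */
}

int main(void)
{
	unsigned char order;

	for (order = 0; order <= 7; order++) {
		unsigned int n = table_size_to_number_of_entries(order);

		printf("order %u: %8u entries, %5u KB table, %3u KB bitmap\n",
		       order, n,
		       n * TCE_ENTRY_SIZE / 1024,
		       n / BITS_PER_BYTE / 1024);
	}
	return 0;
}
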
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
new file mode 100644
index 000000000000..19a6c678d02e
--- /dev/null
+++ b/arch/x86/kernel/time_32.c
@@ -0,0 +1,236 @@
1/*
2 * linux/arch/i386/kernel/time.c
3 *
4 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
5 *
6 * This file contains the PC-specific time handling details:
7 * reading the RTC at bootup, etc..
8 * 1994-07-02 Alan Modra
9 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10 * 1995-03-26 Markus Kuhn
11 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12 * precision CMOS clock update
13 * 1996-05-03 Ingo Molnar
14 * fixed time warps in do_[slow|fast]_gettimeoffset()
15 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
16 * "A Kernel Model for Precision Timekeeping" by Dave Mills
17 * 1998-09-05 (Various)
18 * More robust do_fast_gettimeoffset() algorithm implemented
19 * (works with APM, Cyrix 6x86MX and Centaur C6),
20 * monotonic gettimeofday() with fast_get_timeoffset(),
21 * drift-proof precision TSC calibration on boot
22 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25 * 1998-12-16 Andrea Arcangeli
26 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 27 *		because it was not accounting for lost_ticks.
28 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
29 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30 * serialize accesses to xtime/lost_ticks).
31 */
32
33#include <linux/errno.h>
34#include <linux/sched.h>
35#include <linux/kernel.h>
36#include <linux/param.h>
37#include <linux/string.h>
38#include <linux/mm.h>
39#include <linux/interrupt.h>
40#include <linux/time.h>
41#include <linux/delay.h>
42#include <linux/init.h>
43#include <linux/smp.h>
44#include <linux/module.h>
45#include <linux/sysdev.h>
46#include <linux/bcd.h>
47#include <linux/efi.h>
48#include <linux/mca.h>
49
50#include <asm/io.h>
51#include <asm/smp.h>
52#include <asm/irq.h>
53#include <asm/msr.h>
54#include <asm/delay.h>
55#include <asm/mpspec.h>
56#include <asm/uaccess.h>
57#include <asm/processor.h>
58#include <asm/timer.h>
59#include <asm/time.h>
60
61#include "mach_time.h"
62
63#include <linux/timex.h>
64
65#include <asm/hpet.h>
66
67#include <asm/arch_hooks.h>
68
69#include "io_ports.h"
70
71#include <asm/i8259.h>
72
73#include "do_timer.h"
74
75unsigned int cpu_khz; /* Detected as we calibrate the TSC */
76EXPORT_SYMBOL(cpu_khz);
77
78DEFINE_SPINLOCK(rtc_lock);
79EXPORT_SYMBOL(rtc_lock);
80
81/*
82 * This is a special lock that is owned by the CPU and holds the index
83 * register we are working with. It is required for NMI access to the
84 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
85 */
86volatile unsigned long cmos_lock = 0;
87EXPORT_SYMBOL(cmos_lock);
88
89/* Routines for accessing the CMOS RAM/RTC. */
90unsigned char rtc_cmos_read(unsigned char addr)
91{
92 unsigned char val;
93 lock_cmos_prefix(addr);
94 outb_p(addr, RTC_PORT(0));
95 val = inb_p(RTC_PORT(1));
96 lock_cmos_suffix(addr);
97 return val;
98}
99EXPORT_SYMBOL(rtc_cmos_read);
100
101void rtc_cmos_write(unsigned char val, unsigned char addr)
102{
103 lock_cmos_prefix(addr);
104 outb_p(addr, RTC_PORT(0));
105 outb_p(val, RTC_PORT(1));
106 lock_cmos_suffix(addr);
107}
108EXPORT_SYMBOL(rtc_cmos_write);
109
110static int set_rtc_mmss(unsigned long nowtime)
111{
112 int retval;
113 unsigned long flags;
114
115 /* gets recalled with irq locally disabled */
116 /* XXX - does irqsave resolve this? -johnstul */
117 spin_lock_irqsave(&rtc_lock, flags);
118 retval = set_wallclock(nowtime);
119 spin_unlock_irqrestore(&rtc_lock, flags);
120
121 return retval;
122}
123
124
125int timer_ack;
126
127unsigned long profile_pc(struct pt_regs *regs)
128{
129 unsigned long pc = instruction_pointer(regs);
130
131#ifdef CONFIG_SMP
132 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
133 in_lock_functions(pc)) {
134#ifdef CONFIG_FRAME_POINTER
135 return *(unsigned long *)(regs->ebp + 4);
136#else
137 unsigned long *sp = (unsigned long *)&regs->esp;
138
 139		/* The return address is either directly at the stack pointer
 140		   or above a saved EFLAGS. EFLAGS has bits 22-31 zero,
 141		   whereas kernel addresses don't. */
142 if (sp[0] >> 22)
143 return sp[0];
144 if (sp[1] >> 22)
145 return sp[1];
146#endif
147 }
148#endif
149 return pc;
150}
151EXPORT_SYMBOL(profile_pc);
152
153/*
154 * This is the same as the above, except we _also_ save the current
155 * Time Stamp Counter value at the time of the timer interrupt, so that
 156 * we can later estimate the time of day more exactly.
157 */
158irqreturn_t timer_interrupt(int irq, void *dev_id)
159{
160#ifdef CONFIG_X86_IO_APIC
161 if (timer_ack) {
162 /*
163 * Subtle, when I/O APICs are used we have to ack timer IRQ
164 * manually to reset the IRR bit for do_slow_gettimeoffset().
165 * This will also deassert NMI lines for the watchdog if run
166 * on an 82489DX-based system.
167 */
168 spin_lock(&i8259A_lock);
169 outb(0x0c, PIC_MASTER_OCW3);
170 /* Ack the IRQ; AEOI will end it automatically. */
171 inb(PIC_MASTER_POLL);
172 spin_unlock(&i8259A_lock);
173 }
174#endif
175
176 do_timer_interrupt_hook();
177
178 if (MCA_bus) {
179 /* The PS/2 uses level-triggered interrupts. You can't
180 turn them off, nor would you want to (any attempt to
181 enable edge-triggered interrupts usually gets intercepted by a
182 special hardware circuit). Hence we have to acknowledge
183 the timer interrupt. Through some incredibly stupid
184 design idea, the reset for IRQ 0 is done by setting the
185 high bit of the PPI port B (0x61). Note that some PS/2s,
186 notably the 55SX, work fine if this is removed. */
187
188 u8 irq_v = inb_p( 0x61 ); /* read the current state */
189 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
190 }
191
192 return IRQ_HANDLED;
193}
194
195/* not static: needed by APM */
196unsigned long read_persistent_clock(void)
197{
198 unsigned long retval;
199 unsigned long flags;
200
201 spin_lock_irqsave(&rtc_lock, flags);
202
203 retval = get_wallclock();
204
205 spin_unlock_irqrestore(&rtc_lock, flags);
206
207 return retval;
208}
209
210int update_persistent_clock(struct timespec now)
211{
212 return set_rtc_mmss(now.tv_sec);
213}
214
215extern void (*late_time_init)(void);
216/* Duplicate of time_init() below, with hpet_enable part added */
217void __init hpet_time_init(void)
218{
219 if (!hpet_enable())
220 setup_pit_timer();
221 time_init_hook();
222}
223
224/*
225 * This is called directly from init code; we must delay timer setup in the
226 * HPET case as we can't make the decision to turn on HPET this early in the
227 * boot process.
228 *
229 * The chosen time_init function will usually be hpet_time_init, above, but
230 * in the case of virtual hardware, an alternative function may be substituted.
231 */
232void __init time_init(void)
233{
234 tsc_init();
235 late_time_init = choose_time_init();
236}
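
For reference, the same index/data handshake that rtc_cmos_read() performs can be reproduced from user space with ioperm(). An illustrative sketch only (x86, needs root); register 0x00 is the RTC seconds byte, stored in BCD on PC hardware.

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (ioperm(0x70, 2, 1)) {	/* ports 0x70 (index) and 0x71 (data) */
		perror("ioperm");
		return 1;
	}
	outb(0x00, 0x70);		/* select the RTC seconds register */
	printf("RTC seconds (BCD): %02x\n", inb(0x71));
	return 0;
}
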
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
new file mode 100644
index 000000000000..6d48a4e826d9
--- /dev/null
+++ b/arch/x86/kernel/time_64.c
@@ -0,0 +1,447 @@
1/*
2 * linux/arch/x86-64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002,2006 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/time.h>
22#include <linux/ioport.h>
23#include <linux/module.h>
24#include <linux/device.h>
25#include <linux/sysdev.h>
26#include <linux/bcd.h>
27#include <linux/notifier.h>
28#include <linux/cpu.h>
29#include <linux/kallsyms.h>
30#include <linux/acpi.h>
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/8253pit.h>
36#include <asm/i8253.h>
37#include <asm/pgtable.h>
38#include <asm/vsyscall.h>
39#include <asm/timex.h>
40#include <asm/proto.h>
41#include <asm/hpet.h>
42#include <asm/sections.h>
43#include <linux/hpet.h>
44#include <asm/apic.h>
45#include <asm/hpet.h>
46#include <asm/mpspec.h>
47#include <asm/nmi.h>
48#include <asm/vgtod.h>
49
50static char *timename = NULL;
51
52DEFINE_SPINLOCK(rtc_lock);
53EXPORT_SYMBOL(rtc_lock);
54DEFINE_SPINLOCK(i8253_lock);
55EXPORT_SYMBOL(i8253_lock);
56
57volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
58
59unsigned long profile_pc(struct pt_regs *regs)
60{
61 unsigned long pc = instruction_pointer(regs);
62
 63	/* Assume the lock function has either no stack frame or a copy
 64	   of EFLAGS from PUSHF.
 65	   EFLAGS always has bits 22 and up cleared, unlike kernel addresses. */
66 if (!user_mode(regs) && in_lock_functions(pc)) {
67 unsigned long *sp = (unsigned long *)regs->rsp;
68 if (sp[0] >> 22)
69 return sp[0];
70 if (sp[1] >> 22)
71 return sp[1];
72 }
73 return pc;
74}
75EXPORT_SYMBOL(profile_pc);
76
77/*
78 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
79 * ms after the second nowtime has started, because when nowtime is written
80 * into the registers of the CMOS clock, it will jump to the next second
81 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
82 * sheet for details.
83 */
84
85static int set_rtc_mmss(unsigned long nowtime)
86{
87 int retval = 0;
88 int real_seconds, real_minutes, cmos_minutes;
89 unsigned char control, freq_select;
90
91/*
92 * IRQs are disabled when we're called from the timer interrupt,
93 * no need for spin_lock_irqsave()
94 */
95
96 spin_lock(&rtc_lock);
97
98/*
99 * Tell the clock it's being set and stop it.
100 */
101
102 control = CMOS_READ(RTC_CONTROL);
103 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
104
105 freq_select = CMOS_READ(RTC_FREQ_SELECT);
106 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
107
108 cmos_minutes = CMOS_READ(RTC_MINUTES);
109 BCD_TO_BIN(cmos_minutes);
110
111/*
112 * since we're only adjusting minutes and seconds, don't interfere with hour
113 * overflow. This avoids messing with unknown time zones but requires your RTC
114 * not to be off by more than 15 minutes. Since we're calling it only when
115 * our clock is externally synchronized using NTP, this shouldn't be a problem.
116 */
117
118 real_seconds = nowtime % 60;
119 real_minutes = nowtime / 60;
120 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
121 real_minutes += 30; /* correct for half hour time zone */
122 real_minutes %= 60;
123
124 if (abs(real_minutes - cmos_minutes) >= 30) {
125 printk(KERN_WARNING "time.c: can't update CMOS clock "
126 "from %d to %d\n", cmos_minutes, real_minutes);
127 retval = -1;
128 } else {
129 BIN_TO_BCD(real_seconds);
130 BIN_TO_BCD(real_minutes);
131 CMOS_WRITE(real_seconds, RTC_SECONDS);
132 CMOS_WRITE(real_minutes, RTC_MINUTES);
133 }
134
135/*
136 * The following flags have to be released exactly in this order, otherwise the
137 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
138 * not reset the oscillator and will not update precisely 500 ms later. You
139 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
140 * believes data sheets anyway ... -- Markus Kuhn
141 */
142
143 CMOS_WRITE(control, RTC_CONTROL);
144 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
145
146 spin_unlock(&rtc_lock);
147
148 return retval;
149}
150
151int update_persistent_clock(struct timespec now)
152{
153 return set_rtc_mmss(now.tv_sec);
154}
155
156void main_timer_handler(void)
157{
158/*
159 * Here we are in the timer irq handler. We have irqs locally disabled (so we
160 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
161 * on the other CPU, so we need a lock. We also need to lock the vsyscall
 162 * variables, because both do_timer() and this code change them. -arca+vojtech
163 */
164
165 write_seqlock(&xtime_lock);
166
167/*
168 * Do the timer stuff.
169 */
170
171 do_timer(1);
172#ifndef CONFIG_SMP
173 update_process_times(user_mode(get_irq_regs()));
174#endif
175
176/*
177 * In the SMP case we use the local APIC timer interrupt to do the profiling,
 178 * except when we simulate SMP mode on a uniprocessor system, in which case
 179 * we have to call the local interrupt handler.
180 */
181
182 if (!using_apic_timer)
183 smp_local_timer_interrupt();
184
185 write_sequnlock(&xtime_lock);
186}
187
188static irqreturn_t timer_interrupt(int irq, void *dev_id)
189{
190 if (apic_runs_main_timer > 1)
191 return IRQ_HANDLED;
192 main_timer_handler();
193 if (using_apic_timer)
194 smp_send_timer_broadcast_ipi();
195 return IRQ_HANDLED;
196}
197
198unsigned long read_persistent_clock(void)
199{
200 unsigned int year, mon, day, hour, min, sec;
201 unsigned long flags;
202 unsigned century = 0;
203
204 spin_lock_irqsave(&rtc_lock, flags);
205
206 do {
207 sec = CMOS_READ(RTC_SECONDS);
208 min = CMOS_READ(RTC_MINUTES);
209 hour = CMOS_READ(RTC_HOURS);
210 day = CMOS_READ(RTC_DAY_OF_MONTH);
211 mon = CMOS_READ(RTC_MONTH);
212 year = CMOS_READ(RTC_YEAR);
213#ifdef CONFIG_ACPI
214 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
215 acpi_gbl_FADT.century)
216 century = CMOS_READ(acpi_gbl_FADT.century);
217#endif
218 } while (sec != CMOS_READ(RTC_SECONDS));
219
220 spin_unlock_irqrestore(&rtc_lock, flags);
221
222 /*
223 * We know that x86-64 always uses BCD format, no need to check the
224 * config register.
225 */
226
227 BCD_TO_BIN(sec);
228 BCD_TO_BIN(min);
229 BCD_TO_BIN(hour);
230 BCD_TO_BIN(day);
231 BCD_TO_BIN(mon);
232 BCD_TO_BIN(year);
233
234 if (century) {
235 BCD_TO_BIN(century);
236 year += century * 100;
237 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
238 } else {
239 /*
 240	 * x86-64 systems have only existed since 2002.
 241	 * This will work up to Dec 31, 2100.
242 */
243 year += 2000;
244 }
245
246 return mktime(year, mon, day, hour, min, sec);
247}
248
249/* calibrate_cpu is used on systems with fixed rate TSCs to determine
250 * processor frequency */
251#define TICK_COUNT 100000000
252static unsigned int __init tsc_calibrate_cpu_khz(void)
253{
254 int tsc_start, tsc_now;
255 int i, no_ctr_free;
256 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
257 unsigned long flags;
258
259 for (i = 0; i < 4; i++)
260 if (avail_to_resrv_perfctr_nmi_bit(i))
261 break;
262 no_ctr_free = (i == 4);
263 if (no_ctr_free) {
264 i = 3;
265 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
266 wrmsrl(MSR_K7_EVNTSEL3, 0);
267 rdmsrl(MSR_K7_PERFCTR3, pmc3);
268 } else {
269 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
270 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
271 }
272 local_irq_save(flags);
 273	/* start measuring cycles, incrementing from 0 */
274 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
275 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
276 rdtscl(tsc_start);
277 do {
278 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
279 tsc_now = get_cycles_sync();
280 } while ((tsc_now - tsc_start) < TICK_COUNT);
281
282 local_irq_restore(flags);
283 if (no_ctr_free) {
284 wrmsrl(MSR_K7_EVNTSEL3, 0);
285 wrmsrl(MSR_K7_PERFCTR3, pmc3);
286 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
287 } else {
288 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
289 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
290 }
291
292 return pmc_now * tsc_khz / (tsc_now - tsc_start);
293}
294
295/*
296 * pit_calibrate_tsc() uses the speaker output (channel 2) of
297 * the PIT. This is better than using the timer interrupt output,
298 * because we can read the value of the speaker with just one inb(),
 299 * whereas we need three I/O operations for the interrupt channel.
300 * We count how many ticks the TSC does in 50 ms.
301 */
302
303static unsigned int __init pit_calibrate_tsc(void)
304{
305 unsigned long start, end;
306 unsigned long flags;
307
308 spin_lock_irqsave(&i8253_lock, flags);
309
310 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
311
312 outb(0xb0, 0x43);
313 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
314 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
315 start = get_cycles_sync();
316 while ((inb(0x61) & 0x20) == 0);
317 end = get_cycles_sync();
318
319 spin_unlock_irqrestore(&i8253_lock, flags);
320
321 return (end - start) / 50;
322}
323
324#define PIT_MODE 0x43
325#define PIT_CH0 0x40
326
327static void __pit_init(int val, u8 mode)
328{
329 unsigned long flags;
330
331 spin_lock_irqsave(&i8253_lock, flags);
332 outb_p(mode, PIT_MODE);
333 outb_p(val & 0xff, PIT_CH0); /* LSB */
334 outb_p(val >> 8, PIT_CH0); /* MSB */
335 spin_unlock_irqrestore(&i8253_lock, flags);
336}
337
338void __init pit_init(void)
339{
340 __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
341}
342
343void pit_stop_interrupt(void)
344{
345 __pit_init(0, 0x30); /* mode 0 */
346}
347
348void stop_timer_interrupt(void)
349{
350 char *name;
351 if (hpet_address) {
352 name = "HPET";
353 hpet_timer_stop_set_go(0);
354 } else {
355 name = "PIT";
356 pit_stop_interrupt();
357 }
358 printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
359}
360
361static struct irqaction irq0 = {
362 .handler = timer_interrupt,
363 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
364 .mask = CPU_MASK_NONE,
365 .name = "timer"
366};
367
368void __init time_init(void)
369{
370 if (nohpet)
371 hpet_address = 0;
372
373 if (hpet_arch_init())
374 hpet_address = 0;
375
376 if (hpet_use_timer) {
377 /* set tick_nsec to use the proper rate for HPET */
378 tick_nsec = TICK_NSEC_HPET;
379 tsc_khz = hpet_calibrate_tsc();
380 timename = "HPET";
381 } else {
382 pit_init();
383 tsc_khz = pit_calibrate_tsc();
384 timename = "PIT";
385 }
386
387 cpu_khz = tsc_khz;
388 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
389 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
390 boot_cpu_data.x86 == 16)
391 cpu_khz = tsc_calibrate_cpu_khz();
392
393 if (unsynchronized_tsc())
394 mark_tsc_unstable("TSCs unsynchronized");
395
396 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
397 vgetcpu_mode = VGETCPU_RDTSCP;
398 else
399 vgetcpu_mode = VGETCPU_LSL;
400
401 set_cyc2ns_scale(tsc_khz);
402 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
403 cpu_khz / 1000, cpu_khz % 1000);
404 init_tsc_clocksource();
405
406 setup_irq(0, &irq0);
407}
408
409/*
410 * sysfs support for the timer.
411 */
412
413static int timer_suspend(struct sys_device *dev, pm_message_t state)
414{
415 return 0;
416}
417
418static int timer_resume(struct sys_device *dev)
419{
420 if (hpet_address)
421 hpet_reenable();
422 else
423 i8254_timer_resume();
424 return 0;
425}
426
427static struct sysdev_class timer_sysclass = {
428 .resume = timer_resume,
429 .suspend = timer_suspend,
430 set_kset_name("timer"),
431};
432
433/* XXX this sysfs stuff should probably go elsewhere later -john */
434static struct sys_device device_timer = {
435 .id = 0,
436 .cls = &timer_sysclass,
437};
438
439static int time_init_device(void)
440{
441 int error = sysdev_class_register(&timer_sysclass);
442 if (!error)
443 error = sysdev_register(&device_timer);
444 return error;
445}
446
447device_initcall(time_init_device);
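
Both set_rtc_mmss() and read_persistent_clock() above rely on the CMOS clock storing each field in BCD. A tiny sketch of what the BCD_TO_BIN()/BIN_TO_BCD() macros boil down to for a single byte:

#include <stdio.h>

static unsigned int bcd_to_bin(unsigned int v)
{
	return (v & 0x0f) + (v >> 4) * 10;
}

static unsigned int bin_to_bcd(unsigned int v)
{
	return ((v / 10) << 4) | (v % 10);
}

int main(void)
{
	printf("BCD 0x59 -> %u\n", bcd_to_bin(0x59));	/* 59 */
	printf("42 -> BCD 0x%02x\n", bin_to_bcd(42));	/* 0x42 */
	return 0;
}
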
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 000000000000..45782356a618
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,77 @@
1/*
2 * arch/i386/kernel/topology.c - Populate sysfs with topology information
3 *
4 * Written by: Matthew Dobson, IBM Corporation
5 * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
6 *
7 * Copyright (C) 2002, IBM Corp.
8 *
9 * All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
19 * NON INFRINGEMENT. See the GNU General Public License for more
20 * details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 * Send feedback to <colpatch@us.ibm.com>
27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h>
31#include <linux/mmzone.h>
32#include <asm/cpu.h>
33
34static struct i386_cpu cpu_devices[NR_CPUS];
35
36int arch_register_cpu(int num)
37{
38 /*
 39	 * CPU0 cannot be offlined due to several restrictions and
 40	 * assumptions in the kernel. This simply does not add a
 41	 * control file for it, so one cannot attempt to offline
 42	 * the BSP.
 43	 *
 44	 * Also, certain PCI quirks require that hotplug control not be
 45	 * enabled for all CPUs.
46 */
47 if (num && enable_cpu_hotplug)
48 cpu_devices[num].cpu.hotpluggable = 1;
49
50 return register_cpu(&cpu_devices[num].cpu, num);
51}
52
53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
55
56void arch_unregister_cpu(int num) {
57 return unregister_cpu(&cpu_devices[num].cpu);
58}
59EXPORT_SYMBOL(arch_register_cpu);
60EXPORT_SYMBOL(arch_unregister_cpu);
61#endif /*CONFIG_HOTPLUG_CPU*/
62
63static int __init topology_init(void)
64{
65 int i;
66
67#ifdef CONFIG_NUMA
68 for_each_online_node(i)
69 register_one_node(i);
70#endif /* CONFIG_NUMA */
71
72 for_each_present_cpu(i)
73 arch_register_cpu(i);
74 return 0;
75}
76
77subsys_initcall(topology_init);
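
The hotpluggable flag set in arch_register_cpu() is what makes the per-CPU "online" control file appear under sysfs (for every CPU except CPU0). A small sketch that reads it back; the path assumes the usual sysfs mount at /sys.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/cpu1/online";
	FILE *f = fopen(path, "r");
	int online;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &online) == 1)
		printf("cpu1 online: %d\n", online);
	fclose(f);
	return 0;
}
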
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
new file mode 100644
index 000000000000..f62815f8d06a
--- /dev/null
+++ b/arch/x86/kernel/trampoline_32.S
@@ -0,0 +1,85 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * This is only used for booting secondary CPUs in SMP machine
8 *
 9 *	Entry: CS:IP point to the start of our code; we are
 10 *	in real mode with no stack. The rest of the trampoline
 11 *	page is available to make our stack, but everything else
 12 *	is a mystery.
13 *
14 * In fact we don't actually need a stack so we don't
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 *
21 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value
23 * and IP is zero. Thus, data addresses need to be absolute
24 * (no relocation) and are taken with regard to r_base.
25 *
26 * If you work on this file, check the object module with
27 * objdump --reloc to make sure there are no relocation
28 * entries except for:
29 *
30 * TYPE VALUE
31 * R_386_32 startup_32_smp
32 * R_386_32 boot_gdt
33 */
34
35#include <linux/linkage.h>
36#include <asm/segment.h>
37#include <asm/page.h>
38
39.data
40
41/* We can free up trampoline after bootup if cpu hotplug is not supported. */
42#ifndef CONFIG_HOTPLUG_CPU
43.section ".init.data","aw",@progbits
44#endif
45
46.code16
47
48ENTRY(trampoline_data)
49r_base = .
 50	wbinvd			# Needed for NUMA-Q; should be harmless for others
51 mov %cs, %ax # Code and data in the same place
52 mov %ax, %ds
53
54 cli # We should be safe anyway
55
56 movl $0xA5A5A5A5, trampoline_data - r_base
 57				# write marker so the master knows we're running
58
 59	/* With the GDT tables in a non-default location the kernel can be
 60	 * beyond 16MB, and lgdt will not be able to load the address, since
 61	 * the default operand size in real mode is 16 bit. Use lgdtl instead
 62	 * to force a 32-bit operand size.
63 */
64
65 lidtl boot_idt_descr - r_base # load idt with 0, 0
66 lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
67
68 xor %ax, %ax
69 inc %ax # protected mode (PE) bit
70 lmsw %ax # into protected mode
71 # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
72 ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
73
74 # These need to be in the same 64K segment as the above;
75 # hence we don't use the boot_gdt_descr defined in head.S
76boot_gdt_descr:
77 .word __BOOT_DS + 7 # gdt limit
78 .long boot_gdt - __PAGE_OFFSET # gdt base
79
80boot_idt_descr:
81 .word 0 # idt limit = 0
82 .long 0 # idt base = 0L
83
84.globl trampoline_end
85trampoline_end:
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
new file mode 100644
index 000000000000..607983b0d27b
--- /dev/null
+++ b/arch/x86/kernel/trampoline_64.S
@@ -0,0 +1,166 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 * 15 Sept 2005 Eric Biederman: 64bit PIC support
7 *
 8 *	Entry: CS:IP point to the start of our code; we are
 9 *	in real mode with no stack. The rest of the trampoline
 10 *	page is available to make our stack, but everything else
 11 *	is a mystery.
12 *
13 * In fact we don't actually need a stack so we don't
14 * set one up.
15 *
16 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute
19 * (no relocation) and are taken with regard to r_base.
20 *
21 * With the addition of trampoline_level4_pgt this code can
22 * now enter a 64bit kernel that lives at arbitrary 64bit
23 * physical addresses.
24 *
25 * If you work on this file, check the object module with objdump
26 * --full-contents --reloc to make sure there are no relocation
27 * entries.
28 */
29
30#include <linux/linkage.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/msr.h>
34#include <asm/segment.h>
35
36.data
37
38.code16
39
40ENTRY(trampoline_data)
41r_base = .
42 cli # We should be safe anyway
43 wbinvd
44 mov %cs, %ax # Code and data in the same place
45 mov %ax, %ds
46 mov %ax, %es
47 mov %ax, %ss
48
49
50 movl $0xA5A5A5A5, trampoline_data - r_base
 51				# write marker so the master knows we're running
52
53 # Setup stack
54 movw $(trampoline_stack_end - r_base), %sp
55
56 call verify_cpu # Verify the cpu supports long mode
57 testl %eax, %eax # Check for return code
58 jnz no_longmode
59
60 mov %cs, %ax
61 movzx %ax, %esi # Find the 32bit trampoline location
62 shll $4, %esi
63
64 # Fixup the vectors
65 addl %esi, startup_32_vector - r_base
66 addl %esi, startup_64_vector - r_base
67 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
68
69 /*
 70	 * With the GDT tables in a non-default location the kernel can be
 71	 * beyond 16MB, and lgdt will not be able to load the address, since
 72	 * the default operand size in real mode is 16 bit. Use lgdtl instead
 73	 * to force a 32-bit operand size.
74 */
75
76 lidtl tidt - r_base # load idt with 0, 0
77 lgdtl tgdt - r_base # load gdt with whatever is appropriate
78
79 xor %ax, %ax
80 inc %ax # protected mode (PE) bit
81 lmsw %ax # into protected mode
82
83 # flush prefetch and jump to startup_32
84 ljmpl *(startup_32_vector - r_base)
85
86 .code32
87 .balign 4
88startup_32:
89 movl $__KERNEL_DS, %eax # Initialize the %ds segment register
90 movl %eax, %ds
91
92 xorl %eax, %eax
93 btsl $5, %eax # Enable PAE mode
94 movl %eax, %cr4
95
96 # Setup trampoline 4 level pagetables
97 leal (trampoline_level4_pgt - r_base)(%esi), %eax
98 movl %eax, %cr3
99
100 movl $MSR_EFER, %ecx
101 movl $(1 << _EFER_LME), %eax # Enable Long Mode
102 xorl %edx, %edx
103 wrmsr
104
105 xorl %eax, %eax
106 btsl $31, %eax # Enable paging and in turn activate Long Mode
107 btsl $0, %eax # Enable protected mode
108 movl %eax, %cr0
109
110 /*
111 * At this point we're in long mode but in 32bit compatibility mode
112 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
113 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
114 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
115 */
116 ljmp *(startup_64_vector - r_base)(%esi)
117
118 .code64
119 .balign 4
120startup_64:
121 # Now jump into the kernel using virtual addresses
122 movq $secondary_startup_64, %rax
123 jmp *%rax
124
125 .code16
126no_longmode:
127 hlt
128 jmp no_longmode
129#include "verify_cpu_64.S"
130
 131	# Careful: these need to be in the same 64K segment as the above.
132tidt:
133 .word 0 # idt limit = 0
134 .word 0, 0 # idt base = 0L
135
136 # Duplicate the global descriptor table
137 # so the kernel can live anywhere
138 .balign 4
139tgdt:
140 .short tgdt_end - tgdt # gdt limit
141 .long tgdt - r_base
142 .short 0
143 .quad 0x00cf9b000000ffff # __KERNEL32_CS
144 .quad 0x00af9b000000ffff # __KERNEL_CS
145 .quad 0x00cf93000000ffff # __KERNEL_DS
146tgdt_end:
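	# Editor's note (not part of the original commit): a sketch of how the
	# descriptor quads above decode, assuming the standard x86 GDT layout:
	#   0x00cf9b000000ffff  base 0, limit 0xfffff pages, access 0x9b
	#                       (present, ring 0, execute/read code), flags 0xc
	#                       (4K granularity, 32-bit) -> __KERNEL32_CS
	#   0x00af9b000000ffff  same, but flags 0xa (L=1, D=0), i.e. a 64bit
	#                       code segment -> __KERNEL_CS
	#   0x00cf93000000ffff  access 0x93 (read/write data), 32-bit, 4GB
	#                       -> __KERNEL_DS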
147
148 .balign 4
149startup_32_vector:
150 .long startup_32 - r_base
151 .word __KERNEL32_CS, 0
152
153 .balign 4
154startup_64_vector:
155 .long startup_64 - r_base
156 .word __KERNEL_CS, 0
157
158trampoline_stack:
159 .org 0x1000
160trampoline_stack_end:
161ENTRY(trampoline_level4_pgt)
162 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
163 .fill 510,8,0
164 .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
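	# Editor's note (not part of the original commit): entry 0 above
	# identity-maps low memory via level3_ident_pgt and entry 511 maps
	# the kernel region via level3_kernel_pgt; the 510 entries in
	# between are zeroed by the .fill directive.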
165
166ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
new file mode 100644
index 000000000000..47b0bef335bd
--- /dev/null
+++ b/arch/x86/kernel/traps_32.c
@@ -0,0 +1,1250 @@
1/*
2 * linux/arch/i386/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * 'Traps.c' handles hardware traps and faults after we have saved some
12 * state in 'asm.s'.
13 */
14#include <linux/sched.h>
15#include <linux/kernel.h>
16#include <linux/string.h>
17#include <linux/errno.h>
18#include <linux/timer.h>
19#include <linux/mm.h>
20#include <linux/init.h>
21#include <linux/delay.h>
22#include <linux/spinlock.h>
23#include <linux/interrupt.h>
24#include <linux/highmem.h>
25#include <linux/kallsyms.h>
26#include <linux/ptrace.h>
27#include <linux/utsname.h>
28#include <linux/kprobes.h>
29#include <linux/kexec.h>
30#include <linux/unwind.h>
31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
34
35#ifdef CONFIG_EISA
36#include <linux/ioport.h>
37#include <linux/eisa.h>
38#endif
39
40#ifdef CONFIG_MCA
41#include <linux/mca.h>
42#endif
43
44#if defined(CONFIG_EDAC)
45#include <linux/edac.h>
46#endif
47
48#include <asm/processor.h>
49#include <asm/system.h>
50#include <asm/io.h>
51#include <asm/atomic.h>
52#include <asm/debugreg.h>
53#include <asm/desc.h>
54#include <asm/i387.h>
55#include <asm/nmi.h>
56#include <asm/unwind.h>
57#include <asm/smp.h>
58#include <asm/arch_hooks.h>
59#include <linux/kdebug.h>
60#include <asm/stacktrace.h>
61
62#include <linux/module.h>
63
64#include "mach_traps.h"
65
66int panic_on_unrecovered_nmi;
67
68asmlinkage int system_call(void);
69
70/* Do we ignore FPU interrupts ? */
71char ignore_fpu_irq = 0;
72
73/*
74 * The IDT has to be page-aligned to simplify the Pentium
75 * F0 0F bug workaround. We have a special link segment
76 * for this.
77 */
78struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
79
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int kstack_depth_to_print = 24;
101static unsigned int code_bytes = 64;
102
103static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
104{
105 return p > (void *)tinfo &&
106 p <= (void *)tinfo + THREAD_SIZE - size;
107}
108
109/* The form of the top of the frame on the stack */
110struct stack_frame {
111 struct stack_frame *next_frame;
112 unsigned long return_address;
113};
114
115static inline unsigned long print_context_stack(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long ebp,
117 struct stacktrace_ops *ops, void *data)
118{
119#ifdef CONFIG_FRAME_POINTER
120 struct stack_frame *frame = (struct stack_frame *)ebp;
121 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
122 struct stack_frame *next;
123 unsigned long addr;
124
125 addr = frame->return_address;
126 ops->address(data, addr);
127 /*
128 * break out of recursive entries (such as
129 * end_of_stack_stop_unwind_function). Also,
130 * we can never allow a frame pointer to
131 * move downwards!
132 */
133 next = frame->next_frame;
134 if (next <= frame)
135 break;
136 frame = next;
137 }
138#else
139 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
140 unsigned long addr;
141
142 addr = *stack++;
143 if (__kernel_text_address(addr))
144 ops->address(data, addr);
145 }
146#endif
147 return ebp;
148}
149
150#define MSG(msg) ops->warning(data, msg)
151
152void dump_trace(struct task_struct *task, struct pt_regs *regs,
153 unsigned long *stack,
154 struct stacktrace_ops *ops, void *data)
155{
156 unsigned long ebp = 0;
157
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.esp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!ebp) {
170 if (task == current) {
171 /* Grab ebp right from our regs */
172 asm ("movl %%ebp, %0" : "=r" (ebp) : );
173 } else {
174 /* ebp is the last reg pushed by switch_to */
175 ebp = *(unsigned long *) task->thread.esp;
176 }
177 }
178#endif
179
180 while (1) {
181 struct thread_info *context;
182 context = (struct thread_info *)
183 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
184 ebp = print_context_stack(context, stack, ebp, ops, data);
185 /* Should be after the line below, but somewhere
186 in early boot context comes out corrupted and we
187 can't reference it -AK */
188 if (ops->stack(data, "IRQ") < 0)
189 break;
190 stack = (unsigned long*)context->previous_esp;
191 if (!stack)
192 break;
193 touch_nmi_watchdog();
194 }
195}
196EXPORT_SYMBOL(dump_trace);
197
198static void
199print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
200{
201 printk(data);
202 print_symbol(msg, symbol);
203 printk("\n");
204}
205
206static void print_trace_warning(void *data, char *msg)
207{
208 printk("%s%s\n", (char *)data, msg);
209}
210
211static int print_trace_stack(void *data, char *name)
212{
213 return 0;
214}
215
216/*
217 * Print one address/symbol entry per line.
218 */
219static void print_trace_address(void *data, unsigned long addr)
220{
221 printk("%s [<%08lx>] ", (char *)data, addr);
222 print_symbol("%s\n", addr);
223 touch_nmi_watchdog();
224}
225
226static struct stacktrace_ops print_trace_ops = {
227 .warning = print_trace_warning,
228 .warning_symbol = print_trace_warning_symbol,
229 .stack = print_trace_stack,
230 .address = print_trace_address,
231};
232
233static void
234show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
235 unsigned long * stack, char *log_lvl)
236{
237 dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
238 printk("%s =======================\n", log_lvl);
239}
240
241void show_trace(struct task_struct *task, struct pt_regs *regs,
242 unsigned long * stack)
243{
244 show_trace_log_lvl(task, regs, stack, "");
245}
246
247static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
248 unsigned long *esp, char *log_lvl)
249{
250 unsigned long *stack;
251 int i;
252
253 if (esp == NULL) {
254 if (task)
255 esp = (unsigned long*)task->thread.esp;
256 else
257 esp = (unsigned long *)&esp;
258 }
259
260 stack = esp;
261 for(i = 0; i < kstack_depth_to_print; i++) {
262 if (kstack_end(stack))
263 break;
264 if (i && ((i % 8) == 0))
265 printk("\n%s ", log_lvl);
266 printk("%08lx ", *stack++);
267 }
268 printk("\n%sCall Trace:\n", log_lvl);
269 show_trace_log_lvl(task, regs, esp, log_lvl);
270}
271
272void show_stack(struct task_struct *task, unsigned long *esp)
273{
274 printk(" ");
275 show_stack_log_lvl(task, NULL, esp, "");
276}
277
278/*
279 * The architecture-independent dump_stack generator
280 */
281void dump_stack(void)
282{
283 unsigned long stack;
284
285 show_trace(current, NULL, &stack);
286}
287
288EXPORT_SYMBOL(dump_stack);
289
290void show_registers(struct pt_regs *regs)
291{
292 int i;
293 int in_kernel = 1;
294 unsigned long esp;
295 unsigned short ss, gs;
296
297 esp = (unsigned long) (&regs->esp);
298 savesegment(ss, ss);
299 savesegment(gs, gs);
300 if (user_mode_vm(regs)) {
301 in_kernel = 0;
302 esp = regs->esp;
303 ss = regs->xss & 0xffff;
304 }
305 print_modules();
306 printk(KERN_EMERG "CPU: %d\n"
307 KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
308 KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
309 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
310 print_tainted(), regs->eflags, init_utsname()->release,
311 (int)strcspn(init_utsname()->version, " "),
312 init_utsname()->version);
313 print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
314 printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
315 regs->eax, regs->ebx, regs->ecx, regs->edx);
316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
317 regs->esi, regs->edi, regs->ebp, esp);
318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
321 TASK_COMM_LEN, current->comm, current->pid,
322 current_thread_info(), current, task_thread_info(current));
323 /*
324 * When in-kernel, we also print out the stack and code at the
325 * time of the fault..
326 */
327 if (in_kernel) {
328 u8 *eip;
329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
331 unsigned char c;
332
333 printk("\n" KERN_EMERG "Stack: ");
334 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
335
336 printk(KERN_EMERG "Code: ");
337
338 eip = (u8 *)regs->eip - code_prologue;
339 if (eip < (u8 *)PAGE_OFFSET ||
340 probe_kernel_address(eip, c)) {
341 /* try starting at EIP */
342 eip = (u8 *)regs->eip;
343 code_len = code_len - code_prologue + 1;
344 }
345 for (i = 0; i < code_len; i++, eip++) {
346 if (eip < (u8 *)PAGE_OFFSET ||
347 probe_kernel_address(eip, c)) {
348 printk(" Bad EIP value.");
349 break;
350 }
351 if (eip == (u8 *)regs->eip)
352 printk("<%02x> ", c);
353 else
354 printk("%02x ", c);
355 }
356 }
357 printk("\n");
358}
359
360int is_valid_bugaddr(unsigned long eip)
361{
362 unsigned short ud2;
363
364 if (eip < PAGE_OFFSET)
365 return 0;
366 if (probe_kernel_address((unsigned short *)eip, ud2))
367 return 0;
368
369 return ud2 == 0x0b0f;
370}
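/*
 * Editor's note (not part of the original file): BUG() is emitted as the
 * ud2 instruction, whose opcode bytes are 0f 0b; read as a little-endian
 * 16-bit value at the trapping eip that is 0x0b0f, which is what the check
 * above compares against.
 */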
371
372/*
373 * This is gone through when something in the kernel has done something bad and
374 * is about to be terminated.
375 */
376void die(const char * str, struct pt_regs * regs, long err)
377{
378 static struct {
379 spinlock_t lock;
380 u32 lock_owner;
381 int lock_owner_depth;
382 } die = {
383 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
384 .lock_owner = -1,
385 .lock_owner_depth = 0
386 };
387 static int die_counter;
388 unsigned long flags;
389
390 oops_enter();
391
392 if (die.lock_owner != raw_smp_processor_id()) {
393 console_verbose();
394 spin_lock_irqsave(&die.lock, flags);
395 die.lock_owner = smp_processor_id();
396 die.lock_owner_depth = 0;
397 bust_spinlocks(1);
398 }
399 else
400 local_save_flags(flags);
401
402 if (++die.lock_owner_depth < 3) {
403 int nl = 0;
404 unsigned long esp;
405 unsigned short ss;
406
407 report_bug(regs->eip, regs);
408
409 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
410#ifdef CONFIG_PREEMPT
411 printk(KERN_EMERG "PREEMPT ");
412 nl = 1;
413#endif
414#ifdef CONFIG_SMP
415 if (!nl)
416 printk(KERN_EMERG);
417 printk("SMP ");
418 nl = 1;
419#endif
420#ifdef CONFIG_DEBUG_PAGEALLOC
421 if (!nl)
422 printk(KERN_EMERG);
423 printk("DEBUG_PAGEALLOC");
424 nl = 1;
425#endif
426 if (nl)
427 printk("\n");
428 if (notify_die(DIE_OOPS, str, regs, err,
429 current->thread.trap_no, SIGSEGV) !=
430 NOTIFY_STOP) {
431 show_registers(regs);
432 /* Executive summary in case the oops scrolled away */
433 esp = (unsigned long) (&regs->esp);
434 savesegment(ss, ss);
435 if (user_mode(regs)) {
436 esp = regs->esp;
437 ss = regs->xss & 0xffff;
438 }
439 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
440 print_symbol("%s", regs->eip);
441 printk(" SS:ESP %04x:%08lx\n", ss, esp);
442 }
443 else
444 regs = NULL;
445 } else
446 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
447
448 bust_spinlocks(0);
449 die.lock_owner = -1;
450 add_taint(TAINT_DIE);
451 spin_unlock_irqrestore(&die.lock, flags);
452
453 if (!regs)
454 return;
455
456 if (kexec_should_crash(current))
457 crash_kexec(regs);
458
459 if (in_interrupt())
460 panic("Fatal exception in interrupt");
461
462 if (panic_on_oops)
463 panic("Fatal exception");
464
465 oops_exit();
466 do_exit(SIGSEGV);
467}
468
469static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
470{
471 if (!user_mode_vm(regs))
472 die(str, regs, err);
473}
474
475static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
476 struct pt_regs * regs, long error_code,
477 siginfo_t *info)
478{
479 struct task_struct *tsk = current;
480
481 if (regs->eflags & VM_MASK) {
482 if (vm86)
483 goto vm86_trap;
484 goto trap_signal;
485 }
486
487 if (!user_mode(regs))
488 goto kernel_trap;
489
490 trap_signal: {
491 /*
492 * We want error_code and trap_no set for userspace faults and
493 * kernelspace faults which result in die(), but not
494 * kernelspace faults which are fixed up. die() gives the
495 * process no chance to handle the signal and notice the
496 * kernel fault information, so that won't result in polluting
497 * the information about previously queued, but not yet
498 * delivered, faults. See also do_general_protection below.
499 */
500 tsk->thread.error_code = error_code;
501 tsk->thread.trap_no = trapnr;
502
503 if (info)
504 force_sig_info(signr, info, tsk);
505 else
506 force_sig(signr, tsk);
507 return;
508 }
509
510 kernel_trap: {
511 if (!fixup_exception(regs)) {
512 tsk->thread.error_code = error_code;
513 tsk->thread.trap_no = trapnr;
514 die(str, regs, error_code);
515 }
516 return;
517 }
518
519 vm86_trap: {
520 int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
521 if (ret) goto trap_signal;
522 return;
523 }
524}
525
526#define DO_ERROR(trapnr, signr, str, name) \
527fastcall void do_##name(struct pt_regs * regs, long error_code) \
528{ \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \
531 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
533}
534
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
536fastcall void do_##name(struct pt_regs * regs, long error_code) \
537{ \
538 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \
542 info.si_errno = 0; \
543 info.si_code = sicode; \
544 info.si_addr = (void __user *)siaddr; \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \
547 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
549}
550
551#define DO_VM86_ERROR(trapnr, signr, str, name) \
552fastcall void do_##name(struct pt_regs * regs, long error_code) \
553{ \
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561fastcall void do_##name(struct pt_regs * regs, long error_code) \
562{ \
563 siginfo_t info; \
564 info.si_signo = signr; \
565 info.si_errno = 0; \
566 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \
568 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
569 == NOTIFY_STOP) \
570 return; \
571 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
572}
573
574DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
575#ifndef CONFIG_KPROBES
576DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
577#endif
578DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
579DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
580DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
581DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
582DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
583DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
584DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
585DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
586DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
587
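/*
 * Editor's illustration (not part of the original file): as a concrete
 * example of what the macros above generate, DO_ERROR(10, SIGSEGV,
 * "invalid TSS", invalid_TSS) expands to roughly:
 *
 *	fastcall void do_invalid_TSS(struct pt_regs *regs, long error_code)
 *	{
 *		if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
 *					10, SIGSEGV) == NOTIFY_STOP)
 *			return;
 *		do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
 *	}
 */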
588fastcall void __kprobes do_general_protection(struct pt_regs * regs,
589 long error_code)
590{
591 int cpu = get_cpu();
592 struct tss_struct *tss = &per_cpu(init_tss, cpu);
593 struct thread_struct *thread = &current->thread;
594
595 /*
596 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
597 * invalid offset set (the LAZY one) and the faulting thread has
598	 * a valid I/O bitmap pointer, we copy the I/O bitmap into the TSS,
599	 * set the offset field correctly and then let the CPU
600	 * restart the faulting instruction.
601 */
602 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
603 thread->io_bitmap_ptr) {
604 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
605 thread->io_bitmap_max);
606 /*
607	 * If the previous map extended to higher ports
608	 * than the current one, pad the extra space with 0xff (no access).
609 */
610 if (thread->io_bitmap_max < tss->io_bitmap_max)
611 memset((char *) tss->io_bitmap +
612 thread->io_bitmap_max, 0xff,
613 tss->io_bitmap_max - thread->io_bitmap_max);
614 tss->io_bitmap_max = thread->io_bitmap_max;
615 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
616 tss->io_bitmap_owner = thread;
617 put_cpu();
618 return;
619 }
620 put_cpu();
621
622 if (regs->eflags & VM_MASK)
623 goto gp_in_vm86;
624
625 if (!user_mode(regs))
626 goto gp_in_kernel;
627
628 current->thread.error_code = error_code;
629 current->thread.trap_no = 13;
630 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
631 printk_ratelimit())
632 printk(KERN_INFO
633 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
634 current->comm, current->pid,
635 regs->eip, regs->esp, error_code);
636
637 force_sig(SIGSEGV, current);
638 return;
639
640gp_in_vm86:
641 local_irq_enable();
642 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
643 return;
644
645gp_in_kernel:
646 if (!fixup_exception(regs)) {
647 current->thread.error_code = error_code;
648 current->thread.trap_no = 13;
649 if (notify_die(DIE_GPF, "general protection fault", regs,
650 error_code, 13, SIGSEGV) == NOTIFY_STOP)
651 return;
652 die("general protection fault", regs, error_code);
653 }
654}
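/*
 * Editor's sketch (not part of the original file): the lazy I/O bitmap copy
 * above is exercised by a task that has been granted port access with
 * ioperm(); a minimal, hypothetical user-space example:
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 3, 1) == 0)	// allow ports 0x378-0x37a
 *		outb(0x00, 0x378);	// the first port access after a
 *					// context switch faults with #GP;
 *					// the handler above copies the
 *					// thread's bitmap into the TSS and
 *					// lets the CPU retry the instruction
 */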
655
656static __kprobes void
657mem_parity_error(unsigned char reason, struct pt_regs * regs)
658{
659 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
660 "CPU %d.\n", reason, smp_processor_id());
661 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
662
663#if defined(CONFIG_EDAC)
664 if(edac_handler_set()) {
665 edac_atomic_assert_error();
666 return;
667 }
668#endif
669
670 if (panic_on_unrecovered_nmi)
671 panic("NMI: Not continuing");
672
673 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
674
675 /* Clear and disable the memory parity error line. */
676 clear_mem_error(reason);
677}
678
679static __kprobes void
680io_check_error(unsigned char reason, struct pt_regs * regs)
681{
682 unsigned long i;
683
684 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
685 show_registers(regs);
686
687 /* Re-enable the IOCK line, wait for a few seconds */
688 reason = (reason & 0xf) | 8;
689 outb(reason, 0x61);
690 i = 2000;
691 while (--i) udelay(1000);
692 reason &= ~8;
693 outb(reason, 0x61);
694}
695
696static __kprobes void
697unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
698{
699#ifdef CONFIG_MCA
700 /* Might actually be able to figure out what the guilty party
701 * is. */
702 if( MCA_bus ) {
703 mca_handle_nmi();
704 return;
705 }
706#endif
707 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
708 "CPU %d.\n", reason, smp_processor_id());
709 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
710 if (panic_on_unrecovered_nmi)
711 panic("NMI: Not continuing");
712
713 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
714}
715
716static DEFINE_SPINLOCK(nmi_print_lock);
717
718void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
719{
720 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
721 NOTIFY_STOP)
722 return;
723
724 spin_lock(&nmi_print_lock);
725 /*
726	 * We are in trouble anyway, let's at least try
727 * to get a message out.
728 */
729 bust_spinlocks(1);
730 printk(KERN_EMERG "%s", msg);
731 printk(" on CPU%d, eip %08lx, registers:\n",
732 smp_processor_id(), regs->eip);
733 show_registers(regs);
734 console_silent();
735 spin_unlock(&nmi_print_lock);
736 bust_spinlocks(0);
737
738	/* If we are in the kernel we are probably nested up pretty badly
739	 * and might as well get out now while we still can.
740 */
741 if (!user_mode_vm(regs)) {
742 current->thread.trap_no = 2;
743 crash_kexec(regs);
744 }
745
746 do_exit(SIGSEGV);
747}
748
749static __kprobes void default_do_nmi(struct pt_regs * regs)
750{
751 unsigned char reason = 0;
752
753 /* Only the BSP gets external NMIs from the system. */
754 if (!smp_processor_id())
755 reason = get_nmi_reason();
756
757 if (!(reason & 0xc0)) {
758 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
759 == NOTIFY_STOP)
760 return;
761#ifdef CONFIG_X86_LOCAL_APIC
762 /*
763 * Ok, so this is none of the documented NMI sources,
764 * so it must be the NMI watchdog.
765 */
766 if (nmi_watchdog_tick(regs, reason))
767 return;
768 if (!do_nmi_callback(regs, smp_processor_id()))
769#endif
770 unknown_nmi_error(reason, regs);
771
772 return;
773 }
774 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
775 return;
776 if (reason & 0x80)
777 mem_parity_error(reason, regs);
778 if (reason & 0x40)
779 io_check_error(reason, regs);
780 /*
781 * Reassert NMI in case it became active meanwhile
782 * as it's edge-triggered.
783 */
784 reassert_nmi();
785}
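/*
 * Editor's note (not part of the original file): the reason byte tested
 * above is read from port 0x61 by get_nmi_reason(); bit 7 reports a memory
 * parity/SERR# NMI and bit 6 an I/O check (IOCHK#) NMI, which is why the
 * code masks with 0x80 and 0x40 respectively.
 */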
786
787static int ignore_nmis;
788
789fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
790{
791 int cpu;
792
793 nmi_enter();
794
795 cpu = smp_processor_id();
796
797 ++nmi_count(cpu);
798
799 if (!ignore_nmis)
800 default_do_nmi(regs);
801
802 nmi_exit();
803}
804
805void stop_nmi(void)
806{
807 acpi_nmi_disable();
808 ignore_nmis++;
809}
810
811void restart_nmi(void)
812{
813 ignore_nmis--;
814 acpi_nmi_enable();
815}
816
817#ifdef CONFIG_KPROBES
818fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
819{
820 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
821 == NOTIFY_STOP)
822 return;
823 /* This is an interrupt gate, because kprobes wants interrupts
824 disabled. Normal trap handlers don't. */
825 restore_interrupts(regs);
826 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
827}
828#endif
829
830/*
831 * Our handling of the processor debug registers is non-trivial.
832 * We do not clear them on entry and exit from the kernel. Therefore
833 * it is possible to get a watchpoint trap here from inside the kernel.
834 * However, the code in ./ptrace.c has ensured that the user can
835 * only set watchpoints on userspace addresses. Therefore the in-kernel
836 * watchpoint trap can only occur in code which is reading/writing
837 * from user space. Such code must not hold kernel locks (since it
838 * can equally take a page fault), therefore it is safe to call
839 * force_sig_info even though that claims and releases locks.
840 *
841 * Code in ./signal.c ensures that the debug control register
842 * is restored before we deliver any signal, and therefore that
843 * user code runs with the correct debug control register even though
844 * we clear it here.
845 *
846 * Being careful here means that we don't have to be as careful in a
847 * lot of more complicated places (task switching can be a bit lazy
848 * about restoring all the debug state, and ptrace doesn't have to
849 * find every occurrence of the TF bit that could be saved away even
850 * by user code)
851 */
852fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
853{
854 unsigned int condition;
855 struct task_struct *tsk = current;
856
857 get_debugreg(condition, 6);
858
859 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
860 SIGTRAP) == NOTIFY_STOP)
861 return;
862 /* It's safe to allow irq's after DR6 has been saved */
863 if (regs->eflags & X86_EFLAGS_IF)
864 local_irq_enable();
865
866 /* Mask out spurious debug traps due to lazy DR7 setting */
867 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
868 if (!tsk->thread.debugreg[7])
869 goto clear_dr7;
870 }
871
872 if (regs->eflags & VM_MASK)
873 goto debug_vm86;
874
875 /* Save debug status register where ptrace can see it */
876 tsk->thread.debugreg[6] = condition;
877
878 /*
879 * Single-stepping through TF: make sure we ignore any events in
880 * kernel space (but re-enable TF when returning to user mode).
881 */
882 if (condition & DR_STEP) {
883 /*
884 * We already checked v86 mode above, so we can
885 * check for kernel mode by just checking the CPL
886 * of CS.
887 */
888 if (!user_mode(regs))
889 goto clear_TF_reenable;
890 }
891
892 /* Ok, finally something we can handle */
893 send_sigtrap(tsk, regs, error_code);
894
895 /* Disable additional traps. They'll be re-enabled when
896 * the signal is delivered.
897 */
898clear_dr7:
899 set_debugreg(0, 7);
900 return;
901
902debug_vm86:
903 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
904 return;
905
906clear_TF_reenable:
907 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
908 regs->eflags &= ~TF_MASK;
909 return;
910}
911
912/*
913 * Note that we play around with the 'TS' bit in an attempt to get
914 * the correct behaviour even in the presence of the asynchronous
915 * IRQ13 behaviour
916 */
917void math_error(void __user *eip)
918{
919 struct task_struct * task;
920 siginfo_t info;
921 unsigned short cwd, swd;
922
923 /*
924 * Save the info for the exception handler and clear the error.
925 */
926 task = current;
927 save_init_fpu(task);
928 task->thread.trap_no = 16;
929 task->thread.error_code = 0;
930 info.si_signo = SIGFPE;
931 info.si_errno = 0;
932 info.si_code = __SI_FAULT;
933 info.si_addr = eip;
934 /*
935 * (~cwd & swd) will mask out exceptions that are not set to unmasked
936 * status. 0x3f is the exception bits in these regs, 0x200 is the
937 * C1 reg you need in case of a stack fault, 0x040 is the stack
938 * fault bit. We should only be taking one exception at a time,
939 * so if this combination doesn't produce any single exception,
940	 * then we have a bad program that isn't synchronizing its FPU usage
941 * and it will suffer the consequences since we won't be able to
942 * fully reproduce the context of the exception
943 */
944 cwd = get_fpu_cwd(task);
945 swd = get_fpu_swd(task);
946 switch (swd & ~cwd & 0x3f) {
947 case 0x000: /* No unmasked exception */
948 return;
949 default: /* Multiple exceptions */
950 break;
951 case 0x001: /* Invalid Op */
952 /*
953 * swd & 0x240 == 0x040: Stack Underflow
954 * swd & 0x240 == 0x240: Stack Overflow
955 * User must clear the SF bit (0x40) if set
956 */
957 info.si_code = FPE_FLTINV;
958 break;
959 case 0x002: /* Denormalize */
960 case 0x010: /* Underflow */
961 info.si_code = FPE_FLTUND;
962 break;
963 case 0x004: /* Zero Divide */
964 info.si_code = FPE_FLTDIV;
965 break;
966 case 0x008: /* Overflow */
967 info.si_code = FPE_FLTOVF;
968 break;
969 case 0x020: /* Precision */
970 info.si_code = FPE_FLTRES;
971 break;
972 }
973 force_sig_info(SIGFPE, &info, task);
974}
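/*
 * Editor's worked example (not part of the original file), assuming the
 * default x87 control word 0x037f: with the divide-by-zero mask (bit 2)
 * cleared, cwd == 0x037b, and a 1.0/0.0 sets ZE (bit 2) in the status
 * word, so swd & ~cwd & 0x3f == 0x004 and the switch above selects
 * FPE_FLTDIV.
 */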
975
976fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
977{
978 ignore_fpu_irq = 1;
979 math_error((void __user *)regs->eip);
980}
981
982static void simd_math_error(void __user *eip)
983{
984 struct task_struct * task;
985 siginfo_t info;
986 unsigned short mxcsr;
987
988 /*
989 * Save the info for the exception handler and clear the error.
990 */
991 task = current;
992 save_init_fpu(task);
993 task->thread.trap_no = 19;
994 task->thread.error_code = 0;
995 info.si_signo = SIGFPE;
996 info.si_errno = 0;
997 info.si_code = __SI_FAULT;
998 info.si_addr = eip;
999 /*
1000 * The SIMD FPU exceptions are handled a little differently, as there
1001 * is only a single status/control register. Thus, to determine which
1002 * unmasked exception was caught we must mask the exception mask bits
1003 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1004 */
1005 mxcsr = get_fpu_mxcsr(task);
1006 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1007 case 0x000:
1008 default:
1009 break;
1010 case 0x001: /* Invalid Op */
1011 info.si_code = FPE_FLTINV;
1012 break;
1013 case 0x002: /* Denormalize */
1014 case 0x010: /* Underflow */
1015 info.si_code = FPE_FLTUND;
1016 break;
1017 case 0x004: /* Zero Divide */
1018 info.si_code = FPE_FLTDIV;
1019 break;
1020 case 0x008: /* Overflow */
1021 info.si_code = FPE_FLTOVF;
1022 break;
1023 case 0x020: /* Precision */
1024 info.si_code = FPE_FLTRES;
1025 break;
1026 }
1027 force_sig_info(SIGFPE, &info, task);
1028}
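/*
 * Editor's worked example (not part of the original file), assuming the
 * standard MXCSR layout (flag bits 0-5, mask bits 7-12): with only the
 * divide-by-zero mask (bit 9) cleared and the ZE flag (bit 2) set,
 * mxcsr == 0x1d84, so ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f) == 0x004
 * and the switch above selects FPE_FLTDIV.
 */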
1029
1030fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1031 long error_code)
1032{
1033 if (cpu_has_xmm) {
1034 /* Handle SIMD FPU exceptions on PIII+ processors. */
1035 ignore_fpu_irq = 1;
1036 simd_math_error((void __user *)regs->eip);
1037 } else {
1038 /*
1039 * Handle strange cache flush from user space exception
1040 * in all other cases. This is undocumented behaviour.
1041 */
1042 if (regs->eflags & VM_MASK) {
1043 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1044 error_code);
1045 return;
1046 }
1047 current->thread.trap_no = 19;
1048 current->thread.error_code = error_code;
1049 die_if_kernel("cache flush denied", regs, error_code);
1050 force_sig(SIGSEGV, current);
1051 }
1052}
1053
1054fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1055 long error_code)
1056{
1057#if 0
1058 /* No need to warn about this any longer. */
1059 printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1060#endif
1061}
1062
1063fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1064 unsigned long kesp)
1065{
1066 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
1067 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1068 unsigned long new_kesp = kesp - base;
1069 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1070 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1071 /* Set up base for espfix segment */
1072 desc &= 0x00f0ff0000000000ULL;
1073 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1074 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1075 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1076 (lim_pages & 0xffff);
1077 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1078 return new_kesp;
1079}
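/*
 * Editor's note (not part of the original file): the constants above follow
 * the standard segment descriptor layout -- bits 0-15 and 48-51 hold the
 * limit, bits 16-39 and 56-63 hold the base, and the preserved mask
 * 0x00f0ff0000000000 keeps the access byte and the granularity/size flags
 * while the base and limit of the espfix stack segment are rewritten.
 */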
1080
1081/*
1082 * 'math_state_restore()' saves the current math information in the
1083 * old math state array, and gets the new ones from the current task
1084 *
1085 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1086 * Don't touch unless you *really* know how it works.
1087 *
1088 * Must be called with kernel preemption disabled (in this case,
1089 * local interrupts are disabled at the call-site in entry.S).
1090 */
1091asmlinkage void math_state_restore(void)
1092{
1093 struct thread_info *thread = current_thread_info();
1094 struct task_struct *tsk = thread->task;
1095
1096 clts(); /* Allow maths ops (or we recurse) */
1097 if (!tsk_used_math(tsk))
1098 init_fpu(tsk);
1099 restore_fpu(tsk);
1100 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1101 tsk->fpu_counter++;
1102}
1103EXPORT_SYMBOL_GPL(math_state_restore);
1104
1105#ifndef CONFIG_MATH_EMULATION
1106
1107asmlinkage void math_emulate(long arg)
1108{
1109 printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
1110 printk(KERN_EMERG "killing %s.\n",current->comm);
1111 force_sig(SIGFPE,current);
1112 schedule();
1113}
1114
1115#endif /* CONFIG_MATH_EMULATION */
1116
1117#ifdef CONFIG_X86_F00F_BUG
1118void __init trap_init_f00f_bug(void)
1119{
1120 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
1121
1122 /*
1123 * Update the IDT descriptor and reload the IDT so that
1124 * it uses the read-only mapped virtual address.
1125 */
1126 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
1127 load_idt(&idt_descr);
1128}
1129#endif
1130
1131/*
1132 * This needs to use 'idt_table' rather than 'idt', and
1133 * thus use the _nonmapped_ version of the IDT, as the
1134 * Pentium F0 0F bugfix can have resulted in the mapped
1135 * IDT being write-protected.
1136 */
1137void set_intr_gate(unsigned int n, void *addr)
1138{
1139 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1140}
1141
1142/*
1143 * This routine sets up an interrupt gate at directory privilege level 3.
1144 */
1145static inline void set_system_intr_gate(unsigned int n, void *addr)
1146{
1147 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1148}
1149
1150static void __init set_trap_gate(unsigned int n, void *addr)
1151{
1152 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1153}
1154
1155static void __init set_system_gate(unsigned int n, void *addr)
1156{
1157 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1158}
1159
1160static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1161{
1162 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1163}
1164
1165
1166void __init trap_init(void)
1167{
1168#ifdef CONFIG_EISA
1169 void __iomem *p = ioremap(0x0FFFD9, 4);
1170 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1171 EISA_bus = 1;
1172 }
1173 iounmap(p);
1174#endif
1175
1176#ifdef CONFIG_X86_LOCAL_APIC
1177 init_apic_mappings();
1178#endif
1179
1180 set_trap_gate(0,&divide_error);
1181 set_intr_gate(1,&debug);
1182 set_intr_gate(2,&nmi);
1183 set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
1184 set_system_gate(4,&overflow);
1185 set_trap_gate(5,&bounds);
1186 set_trap_gate(6,&invalid_op);
1187 set_trap_gate(7,&device_not_available);
1188 set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
1189 set_trap_gate(9,&coprocessor_segment_overrun);
1190 set_trap_gate(10,&invalid_TSS);
1191 set_trap_gate(11,&segment_not_present);
1192 set_trap_gate(12,&stack_segment);
1193 set_trap_gate(13,&general_protection);
1194 set_intr_gate(14,&page_fault);
1195 set_trap_gate(15,&spurious_interrupt_bug);
1196 set_trap_gate(16,&coprocessor_error);
1197 set_trap_gate(17,&alignment_check);
1198#ifdef CONFIG_X86_MCE
1199 set_trap_gate(18,&machine_check);
1200#endif
1201 set_trap_gate(19,&simd_coprocessor_error);
1202
1203 if (cpu_has_fxsr) {
1204 /*
1205 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1206 * Generates a compile-time "error: zero width for bit-field" if
1207 * the alignment is wrong.
1208 */
1209 struct fxsrAlignAssert {
1210 int _:!(offsetof(struct task_struct,
1211 thread.i387.fxsave) & 15);
1212 };
1213
1214 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1215 set_in_cr4(X86_CR4_OSFXSR);
1216 printk("done.\n");
1217 }
1218 if (cpu_has_xmm) {
1219 printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
1220 "support... ");
1221 set_in_cr4(X86_CR4_OSXMMEXCPT);
1222 printk("done.\n");
1223 }
1224
1225 set_system_gate(SYSCALL_VECTOR,&system_call);
1226
1227 /*
1228 * Should be a barrier for any external CPU state.
1229 */
1230 cpu_init();
1231
1232 trap_init_hook();
1233}
1234
1235static int __init kstack_setup(char *s)
1236{
1237 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1238 return 1;
1239}
1240__setup("kstack=", kstack_setup);
1241
1242static int __init code_bytes_setup(char *s)
1243{
1244 code_bytes = simple_strtoul(s, NULL, 0);
1245 if (code_bytes > 8192)
1246 code_bytes = 8192;
1247
1248 return 1;
1249}
1250__setup("code_bytes=", code_bytes_setup);
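/*
 * Editor's usage note (not part of the original file): both knobs are boot
 * parameters, e.g. booting with "kstack=48 code_bytes=128" on the kernel
 * command line makes an oops dump 48 stack words and 128 code bytes.
 */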
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
new file mode 100644
index 000000000000..03888420775d
--- /dev/null
+++ b/arch/x86/kernel/traps_64.c
@@ -0,0 +1,1138 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 */
10
11/*
12 * 'Traps.c' handles hardware traps and faults after we have saved some
13 * state in 'entry.S'.
14 */
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/ptrace.h>
20#include <linux/timer.h>
21#include <linux/mm.h>
22#include <linux/init.h>
23#include <linux/delay.h>
24#include <linux/spinlock.h>
25#include <linux/interrupt.h>
26#include <linux/kallsyms.h>
27#include <linux/module.h>
28#include <linux/moduleparam.h>
29#include <linux/nmi.h>
30#include <linux/kprobes.h>
31#include <linux/kexec.h>
32#include <linux/unwind.h>
33#include <linux/uaccess.h>
34#include <linux/bug.h>
35#include <linux/kdebug.h>
36
37#if defined(CONFIG_EDAC)
38#include <linux/edac.h>
39#endif
40
41#include <asm/system.h>
42#include <asm/io.h>
43#include <asm/atomic.h>
44#include <asm/debugreg.h>
45#include <asm/desc.h>
46#include <asm/i387.h>
47#include <asm/processor.h>
48#include <asm/unwind.h>
49#include <asm/smp.h>
50#include <asm/pgalloc.h>
51#include <asm/pda.h>
52#include <asm/proto.h>
53#include <asm/nmi.h>
54#include <asm/stacktrace.h>
55
56asmlinkage void divide_error(void);
57asmlinkage void debug(void);
58asmlinkage void nmi(void);
59asmlinkage void int3(void);
60asmlinkage void overflow(void);
61asmlinkage void bounds(void);
62asmlinkage void invalid_op(void);
63asmlinkage void device_not_available(void);
64asmlinkage void double_fault(void);
65asmlinkage void coprocessor_segment_overrun(void);
66asmlinkage void invalid_TSS(void);
67asmlinkage void segment_not_present(void);
68asmlinkage void stack_segment(void);
69asmlinkage void general_protection(void);
70asmlinkage void page_fault(void);
71asmlinkage void coprocessor_error(void);
72asmlinkage void simd_coprocessor_error(void);
73asmlinkage void reserved(void);
74asmlinkage void alignment_check(void);
75asmlinkage void machine_check(void);
76asmlinkage void spurious_interrupt_bug(void);
77
78static inline void conditional_sti(struct pt_regs *regs)
79{
80 if (regs->eflags & X86_EFLAGS_IF)
81 local_irq_enable();
82}
83
84static inline void preempt_conditional_sti(struct pt_regs *regs)
85{
86 preempt_disable();
87 if (regs->eflags & X86_EFLAGS_IF)
88 local_irq_enable();
89}
90
91static inline void preempt_conditional_cli(struct pt_regs *regs)
92{
93 if (regs->eflags & X86_EFLAGS_IF)
94 local_irq_disable();
95 /* Make sure to not schedule here because we could be running
96 on an exception stack. */
97 preempt_enable_no_resched();
98}
99
100int kstack_depth_to_print = 12;
101
102#ifdef CONFIG_KALLSYMS
103void printk_address(unsigned long address)
104{
105 unsigned long offset = 0, symsize;
106 const char *symname;
107 char *modname;
108 char *delim = ":";
109 char namebuf[128];
110
111 symname = kallsyms_lookup(address, &symsize, &offset,
112 &modname, namebuf);
113 if (!symname) {
114 printk(" [<%016lx>]\n", address);
115 return;
116 }
117 if (!modname)
118 modname = delim = "";
119 printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
120 address, delim, modname, delim, symname, offset, symsize);
121}
122#else
123void printk_address(unsigned long address)
124{
125 printk(" [<%016lx>]\n", address);
126}
127#endif
128
129static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
130 unsigned *usedp, char **idp)
131{
132 static char ids[][8] = {
133 [DEBUG_STACK - 1] = "#DB",
134 [NMI_STACK - 1] = "NMI",
135 [DOUBLEFAULT_STACK - 1] = "#DF",
136 [STACKFAULT_STACK - 1] = "#SS",
137 [MCE_STACK - 1] = "#MC",
138#if DEBUG_STKSZ > EXCEPTION_STKSZ
139 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
140#endif
141 };
142 unsigned k;
143
144 /*
145 * Iterate over all exception stacks, and figure out whether
146 * 'stack' is in one of them:
147 */
148 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
149 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
150 /*
151 * Is 'stack' above this exception frame's end?
152 * If yes then skip to the next frame.
153 */
154 if (stack >= end)
155 continue;
156 /*
157 * Is 'stack' above this exception frame's start address?
158 * If yes then we found the right frame.
159 */
160 if (stack >= end - EXCEPTION_STKSZ) {
161 /*
162 * Make sure we only iterate through an exception
163 * stack once. If it comes up for the second time
164 * then there's something wrong going on - just
165 * break out and return NULL:
166 */
167 if (*usedp & (1U << k))
168 break;
169 *usedp |= 1U << k;
170 *idp = ids[k];
171 return (unsigned long *)end;
172 }
173 /*
174 * If this is a debug stack, and if it has a larger size than
175 * the usual exception stacks, then 'stack' might still
176 * be within the lower portion of the debug stack:
177 */
178#if DEBUG_STKSZ > EXCEPTION_STKSZ
179 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
180 unsigned j = N_EXCEPTION_STACKS - 1;
181
182 /*
183 * Black magic. A large debug stack is composed of
184 * multiple exception stack entries, which we
185			 * iterate through now. Don't look:
186 */
187 do {
188 ++j;
189 end -= EXCEPTION_STKSZ;
190 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
191 } while (stack < end - EXCEPTION_STKSZ);
192 if (*usedp & (1U << j))
193 break;
194 *usedp |= 1U << j;
195 *idp = ids[j];
196 return (unsigned long *)end;
197 }
198#endif
199 }
200 return NULL;
201}
202
203#define MSG(txt) ops->warning(data, txt)
204
205/*
206 * x86-64 can have up to three kernel stacks:
207 * process stack
208 * interrupt stack
209 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
210 */
211
212static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
213{
214 void *t = (void *)tinfo;
215 return p > t && p < t + THREAD_SIZE - 3;
216}
217
218void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
219 unsigned long *stack,
220 struct stacktrace_ops *ops, void *data)
221{
222 const unsigned cpu = get_cpu();
223 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
224 unsigned used = 0;
225 struct thread_info *tinfo;
226
227 if (!tsk)
228 tsk = current;
229
230 if (!stack) {
231 unsigned long dummy;
232 stack = &dummy;
233 if (tsk && tsk != current)
234 stack = (unsigned long *)tsk->thread.rsp;
235 }
236
237 /*
238 * Print function call entries within a stack. 'cond' is the
239 * "end of stackframe" condition, that the 'stack++'
240 * iteration will eventually trigger.
241 */
242#define HANDLE_STACK(cond) \
243 do while (cond) { \
244 unsigned long addr = *stack++; \
245 /* Use unlocked access here because except for NMIs \
246 we should be already protected against module unloads */ \
247 if (__kernel_text_address(addr)) { \
248 /* \
249 * If the address is either in the text segment of the \
250 * kernel, or in the region which contains vmalloc'ed \
251 * memory, it *may* be the address of a calling \
252 * routine; if so, print it so that someone tracing \
253 * down the cause of the crash will be able to figure \
254 * out the call path that was taken. \
255 */ \
256 ops->address(data, addr); \
257 } \
258 } while (0)
259
260 /*
261 * Print function call entries in all stacks, starting at the
262	 * current stack address. If the stacks consist of nested
263	 * exceptions, each one is walked in turn down to the process stack.
264 */
265 for (;;) {
266 char *id;
267 unsigned long *estack_end;
268 estack_end = in_exception_stack(cpu, (unsigned long)stack,
269 &used, &id);
270
271 if (estack_end) {
272 if (ops->stack(data, id) < 0)
273 break;
274 HANDLE_STACK (stack < estack_end);
275 ops->stack(data, "<EOE>");
276 /*
277 * We link to the next stack via the
278 * second-to-last pointer (index -2 to end) in the
279 * exception stack:
280 */
281 stack = (unsigned long *) estack_end[-2];
282 continue;
283 }
284 if (irqstack_end) {
285 unsigned long *irqstack;
286 irqstack = irqstack_end -
287 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
288
289 if (stack >= irqstack && stack < irqstack_end) {
290 if (ops->stack(data, "IRQ") < 0)
291 break;
292 HANDLE_STACK (stack < irqstack_end);
293 /*
294				 * We link to the next stack (which would normally
295				 * be the process stack) via the last
296				 * pointer (index -1 to end) in the IRQ stack:
297 */
298 stack = (unsigned long *) (irqstack_end[-1]);
299 irqstack_end = NULL;
300 ops->stack(data, "EOI");
301 continue;
302 }
303 }
304 break;
305 }
306
307 /*
308 * This handles the process stack:
309 */
310 tinfo = task_thread_info(tsk);
311 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
312#undef HANDLE_STACK
313 put_cpu();
314}
315EXPORT_SYMBOL(dump_trace);
316
317static void
318print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
319{
320 print_symbol(msg, symbol);
321 printk("\n");
322}
323
324static void print_trace_warning(void *data, char *msg)
325{
326 printk("%s\n", msg);
327}
328
329static int print_trace_stack(void *data, char *name)
330{
331 printk(" <%s> ", name);
332 return 0;
333}
334
335static void print_trace_address(void *data, unsigned long addr)
336{
337 touch_nmi_watchdog();
338 printk_address(addr);
339}
340
341static struct stacktrace_ops print_trace_ops = {
342 .warning = print_trace_warning,
343 .warning_symbol = print_trace_warning_symbol,
344 .stack = print_trace_stack,
345 .address = print_trace_address,
346};
347
348void
349show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
350{
351 printk("\nCall Trace:\n");
352 dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
353 printk("\n");
354}
355
356static void
357_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
358{
359 unsigned long *stack;
360 int i;
361 const int cpu = smp_processor_id();
362 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
363 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
364
365 // debugging aid: "show_stack(NULL, NULL);" prints the
366 // back trace for this cpu.
367
368 if (rsp == NULL) {
369 if (tsk)
370 rsp = (unsigned long *)tsk->thread.rsp;
371 else
372 rsp = (unsigned long *)&rsp;
373 }
374
375 stack = rsp;
376 for(i=0; i < kstack_depth_to_print; i++) {
377 if (stack >= irqstack && stack <= irqstack_end) {
378 if (stack == irqstack_end) {
379 stack = (unsigned long *) (irqstack_end[-1]);
380 printk(" <EOI> ");
381 }
382 } else {
383 if (((long) stack & (THREAD_SIZE-1)) == 0)
384 break;
385 }
386 if (i && ((i % 4) == 0))
387 printk("\n");
388 printk(" %016lx", *stack++);
389 touch_nmi_watchdog();
390 }
391 show_trace(tsk, regs, rsp);
392}
393
394void show_stack(struct task_struct *tsk, unsigned long * rsp)
395{
396 _show_stack(tsk, NULL, rsp);
397}
398
399/*
400 * The architecture-independent dump_stack generator
401 */
402void dump_stack(void)
403{
404 unsigned long dummy;
405 show_trace(NULL, NULL, &dummy);
406}
407
408EXPORT_SYMBOL(dump_stack);
409
410void show_registers(struct pt_regs *regs)
411{
412 int i;
413 int in_kernel = !user_mode(regs);
414 unsigned long rsp;
415 const int cpu = smp_processor_id();
416 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
417
418 rsp = regs->rsp;
419 printk("CPU %d ", cpu);
420 __show_regs(regs);
421 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
422 cur->comm, cur->pid, task_thread_info(cur), cur);
423
424 /*
425 * When in-kernel, we also print out the stack and code at the
426 * time of the fault..
427 */
428 if (in_kernel) {
429 printk("Stack: ");
430 _show_stack(NULL, regs, (unsigned long*)rsp);
431
432 printk("\nCode: ");
433 if (regs->rip < PAGE_OFFSET)
434 goto bad;
435
436 for (i=0; i<20; i++) {
437 unsigned char c;
438 if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
439bad:
440 printk(" Bad RIP value.");
441 break;
442 }
443 printk("%02x ", c);
444 }
445 }
446 printk("\n");
447}
448
449int is_valid_bugaddr(unsigned long rip)
450{
451 unsigned short ud2;
452
453 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
454 return 0;
455
456 return ud2 == 0x0b0f;
457}
458
459#ifdef CONFIG_BUG
460void out_of_line_bug(void)
461{
462 BUG();
463}
464EXPORT_SYMBOL(out_of_line_bug);
465#endif
466
467static DEFINE_SPINLOCK(die_lock);
468static int die_owner = -1;
469static unsigned int die_nest_count;
470
471unsigned __kprobes long oops_begin(void)
472{
473 int cpu;
474 unsigned long flags;
475
476 oops_enter();
477
478 /* racy, but better than risking deadlock. */
479 local_irq_save(flags);
480 cpu = smp_processor_id();
481 if (!spin_trylock(&die_lock)) {
482 if (cpu == die_owner)
483 /* nested oops. should stop eventually */;
484 else
485 spin_lock(&die_lock);
486 }
487 die_nest_count++;
488 die_owner = cpu;
489 console_verbose();
490 bust_spinlocks(1);
491 return flags;
492}
493
494void __kprobes oops_end(unsigned long flags)
495{
496 die_owner = -1;
497 bust_spinlocks(0);
498 die_nest_count--;
499 if (die_nest_count)
500 /* We still own the lock */
501 local_irq_restore(flags);
502 else
503 /* Nest count reaches zero, release the lock. */
504 spin_unlock_irqrestore(&die_lock, flags);
505 if (panic_on_oops)
506 panic("Fatal exception");
507 oops_exit();
508}
509
510void __kprobes __die(const char * str, struct pt_regs * regs, long err)
511{
512 static int die_counter;
513 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
514#ifdef CONFIG_PREEMPT
515 printk("PREEMPT ");
516#endif
517#ifdef CONFIG_SMP
518 printk("SMP ");
519#endif
520#ifdef CONFIG_DEBUG_PAGEALLOC
521 printk("DEBUG_PAGEALLOC");
522#endif
523 printk("\n");
524 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
525 show_registers(regs);
526 add_taint(TAINT_DIE);
527 /* Executive summary in case the oops scrolled away */
528 printk(KERN_ALERT "RIP ");
529 printk_address(regs->rip);
530 printk(" RSP <%016lx>\n", regs->rsp);
531 if (kexec_should_crash(current))
532 crash_kexec(regs);
533}
534
535void die(const char * str, struct pt_regs * regs, long err)
536{
537 unsigned long flags = oops_begin();
538
539 if (!user_mode(regs))
540 report_bug(regs->rip, regs);
541
542 __die(str, regs, err);
543 oops_end(flags);
544 do_exit(SIGSEGV);
545}
546
547void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
548{
549 unsigned long flags = oops_begin();
550
551 /*
552	 * We are in trouble anyway, let's at least try
553 * to get a message out.
554 */
555 printk(str, smp_processor_id());
556 show_registers(regs);
557 if (kexec_should_crash(current))
558 crash_kexec(regs);
559 if (do_panic || panic_on_oops)
560 panic("Non maskable interrupt");
561 oops_end(flags);
562 nmi_exit();
563 local_irq_enable();
564 do_exit(SIGSEGV);
565}
566
567static void __kprobes do_trap(int trapnr, int signr, char *str,
568 struct pt_regs * regs, long error_code,
569 siginfo_t *info)
570{
571 struct task_struct *tsk = current;
572
573 if (user_mode(regs)) {
574 /*
575 * We want error_code and trap_no set for userspace
576 * faults and kernelspace faults which result in
577 * die(), but not kernelspace faults which are fixed
578 * up. die() gives the process no chance to handle
579 * the signal and notice the kernel fault information,
580 * so that won't result in polluting the information
581 * about previously queued, but not yet delivered,
582 * faults. See also do_general_protection below.
583 */
584 tsk->thread.error_code = error_code;
585 tsk->thread.trap_no = trapnr;
586
587 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
588 printk_ratelimit())
589 printk(KERN_INFO
590 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
591 tsk->comm, tsk->pid, str,
592 regs->rip, regs->rsp, error_code);
593
594 if (info)
595 force_sig_info(signr, info, tsk);
596 else
597 force_sig(signr, tsk);
598 return;
599 }
600
601
602 /* kernel trap */
603 {
604 const struct exception_table_entry *fixup;
605 fixup = search_exception_tables(regs->rip);
606 if (fixup)
607 regs->rip = fixup->fixup;
608 else {
609 tsk->thread.error_code = error_code;
610 tsk->thread.trap_no = trapnr;
611 die(str, regs, error_code);
612 }
613 return;
614 }
615}
616
617#define DO_ERROR(trapnr, signr, str, name) \
618asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
619{ \
620 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
621 == NOTIFY_STOP) \
622 return; \
623 conditional_sti(regs); \
624 do_trap(trapnr, signr, str, regs, error_code, NULL); \
625}
626
627#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
628asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
629{ \
630 siginfo_t info; \
631 info.si_signo = signr; \
632 info.si_errno = 0; \
633 info.si_code = sicode; \
634 info.si_addr = (void __user *)siaddr; \
635 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
636 == NOTIFY_STOP) \
637 return; \
638 conditional_sti(regs); \
639 do_trap(trapnr, signr, str, regs, error_code, &info); \
640}
641
642DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
643DO_ERROR( 4, SIGSEGV, "overflow", overflow)
644DO_ERROR( 5, SIGSEGV, "bounds", bounds)
645DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
646DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
647DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
648DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
649DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
650DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
651DO_ERROR(18, SIGSEGV, "reserved", reserved)
652
653/* Runs on IST stack */
654asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
655{
656 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
657 12, SIGBUS) == NOTIFY_STOP)
658 return;
659 preempt_conditional_sti(regs);
660 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
661 preempt_conditional_cli(regs);
662}
663
664asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
665{
666 static const char str[] = "double fault";
667 struct task_struct *tsk = current;
668
669	/* Return value not checked because a double fault cannot be ignored */
670 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
671
672 tsk->thread.error_code = error_code;
673 tsk->thread.trap_no = 8;
674
675 /* This is always a kernel trap and never fixable (and thus must
676 never return). */
677 for (;;)
678 die(str, regs, error_code);
679}
680
681asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
682 long error_code)
683{
684 struct task_struct *tsk = current;
685
686 conditional_sti(regs);
687
688 if (user_mode(regs)) {
689 tsk->thread.error_code = error_code;
690 tsk->thread.trap_no = 13;
691
692 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
693 printk_ratelimit())
694 printk(KERN_INFO
695 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
696 tsk->comm, tsk->pid,
697 regs->rip, regs->rsp, error_code);
698
699 force_sig(SIGSEGV, tsk);
700 return;
701 }
702
703 /* kernel gp */
704 {
705 const struct exception_table_entry *fixup;
706 fixup = search_exception_tables(regs->rip);
707 if (fixup) {
708 regs->rip = fixup->fixup;
709 return;
710 }
711
712 tsk->thread.error_code = error_code;
713 tsk->thread.trap_no = 13;
714 if (notify_die(DIE_GPF, "general protection fault", regs,
715 error_code, 13, SIGSEGV) == NOTIFY_STOP)
716 return;
717 die("general protection fault", regs, error_code);
718 }
719}
720
721static __kprobes void
722mem_parity_error(unsigned char reason, struct pt_regs * regs)
723{
724 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
725 reason);
726 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
727
728#if defined(CONFIG_EDAC)
729 if(edac_handler_set()) {
730 edac_atomic_assert_error();
731 return;
732 }
733#endif
734
735 if (panic_on_unrecovered_nmi)
736 panic("NMI: Not continuing");
737
738 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
739
740 /* Clear and disable the memory parity error line. */
741 reason = (reason & 0xf) | 4;
742 outb(reason, 0x61);
743}
744
745static __kprobes void
746io_check_error(unsigned char reason, struct pt_regs * regs)
747{
748 printk("NMI: IOCK error (debug interrupt?)\n");
749 show_registers(regs);
750
751 /* Re-enable the IOCK line, wait for a few seconds */
752 reason = (reason & 0xf) | 8;
753 outb(reason, 0x61);
754 mdelay(2000);
755 reason &= ~8;
756 outb(reason, 0x61);
757}
758
759static __kprobes void
760unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
761{
762 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
763 reason);
764 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
765
766 if (panic_on_unrecovered_nmi)
767 panic("NMI: Not continuing");
768
769 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
770}
771
772/* Runs on IST stack. This code must keep interrupts off all the time.
773 Nested NMIs are prevented by the CPU. */
774asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
775{
776 unsigned char reason = 0;
777 int cpu;
778
779 cpu = smp_processor_id();
780
781 /* Only the BSP gets external NMIs from the system. */
782 if (!cpu)
783 reason = get_nmi_reason();
784
785 if (!(reason & 0xc0)) {
786 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
787 == NOTIFY_STOP)
788 return;
789 /*
790 * Ok, so this is none of the documented NMI sources,
791 * so it must be the NMI watchdog.
792 */
793 if (nmi_watchdog_tick(regs,reason))
794 return;
795 if (!do_nmi_callback(regs,cpu))
796 unknown_nmi_error(reason, regs);
797
798 return;
799 }
800 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
801 return;
802
803 /* AK: following checks seem to be broken on modern chipsets. FIXME */
804
805 if (reason & 0x80)
806 mem_parity_error(reason, regs);
807 if (reason & 0x40)
808 io_check_error(reason, regs);
809}
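
The reason byte read from port 0x61 above encodes the NMI source: bit 7 (0x80) reports a memory parity/SERR error and bit 6 (0x40) an I/O check (IOCHK) error, and writing bit 2 or bit 3 back clears and re-enables the corresponding line. A minimal standalone sketch of the same decoding, for illustration only (the port read is faked here, since only the kernel can touch port 0x61):

#include <stdio.h>

/* Illustrative only: decode an NMI status byte the way default_do_nmi() does. */
static void decode_nmi_reason(unsigned char reason)
{
	if (!(reason & 0xc0)) {
		printf("no documented source set: watchdog or unknown NMI\n");
		return;
	}
	if (reason & 0x80)
		printf("memory parity/SERR error (cleared by writing bit 2)\n");
	if (reason & 0x40)
		printf("I/O check error (cleared by writing bit 3)\n");
}

int main(void)
{
	decode_nmi_reason(0x80);	/* parity/SERR error */
	decode_nmi_reason(0x05);	/* neither bit set: watchdog or unknown */
	return 0;
}
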
810
811/* runs on IST stack. */
812asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
813{
814 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
815 return;
816 }
817 preempt_conditional_sti(regs);
818 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
819 preempt_conditional_cli(regs);
820}
821
822/* Help a handler running on the IST stack to switch back to the user stack
823 for scheduling or signal handling. The actual stack switch is done in
824 entry.S */
825asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
826{
827 struct pt_regs *regs = eregs;
828	/* Already synced */
829 if (eregs == (struct pt_regs *)eregs->rsp)
830 ;
831 /* Exception from user space */
832 else if (user_mode(eregs))
833 regs = task_pt_regs(current);
834 /* Exception from kernel and interrupts are enabled. Move to
835 kernel process stack. */
836 else if (eregs->eflags & X86_EFLAGS_IF)
837 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
838 if (eregs != regs)
839 *regs = *eregs;
840 return regs;
841}
842
843/* runs on IST stack. */
844asmlinkage void __kprobes do_debug(struct pt_regs * regs,
845 unsigned long error_code)
846{
847 unsigned long condition;
848 struct task_struct *tsk = current;
849 siginfo_t info;
850
851 get_debugreg(condition, 6);
852
853 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
854 SIGTRAP) == NOTIFY_STOP)
855 return;
856
857 preempt_conditional_sti(regs);
858
859 /* Mask out spurious debug traps due to lazy DR7 setting */
860 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
861 if (!tsk->thread.debugreg7) {
862 goto clear_dr7;
863 }
864 }
865
866 tsk->thread.debugreg6 = condition;
867
868 /* Mask out spurious TF errors due to lazy TF clearing */
869 if (condition & DR_STEP) {
870 /*
871 * The TF error should be masked out only if the current
872 * process is not traced and if the TRAP flag has been set
873 * previously by a tracing process (condition detected by
874 * the PT_DTRACE flag); remember that the i386 TRAP flag
875 * can be modified by the process itself in user mode,
876 * allowing programs to debug themselves without the ptrace()
877 * interface.
878 */
879 if (!user_mode(regs))
880 goto clear_TF_reenable;
881 /*
882 * Was the TF flag set by a debugger? If so, clear it now,
883 * so that register information is correct.
884 */
885 if (tsk->ptrace & PT_DTRACE) {
886 regs->eflags &= ~TF_MASK;
887 tsk->ptrace &= ~PT_DTRACE;
888 }
889 }
890
891 /* Ok, finally something we can handle */
892 tsk->thread.trap_no = 1;
893 tsk->thread.error_code = error_code;
894 info.si_signo = SIGTRAP;
895 info.si_errno = 0;
896 info.si_code = TRAP_BRKPT;
897 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
898 force_sig_info(SIGTRAP, &info, tsk);
899
900clear_dr7:
901 set_debugreg(0UL, 7);
902 preempt_conditional_cli(regs);
903 return;
904
905clear_TF_reenable:
906 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
907 regs->eflags &= ~TF_MASK;
908 preempt_conditional_cli(regs);
909}
910
911static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
912{
913 const struct exception_table_entry *fixup;
914 fixup = search_exception_tables(regs->rip);
915 if (fixup) {
916 regs->rip = fixup->fixup;
917 return 1;
918 }
919 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
920 /* Illegal floating point operation in the kernel */
921 current->thread.trap_no = trapnr;
922 die(str, regs, 0);
923 return 0;
924}
925
926/*
927 * Note that we play around with the 'TS' bit in an attempt to get
928 * the correct behaviour even in the presence of the asynchronous
929 * IRQ13 behaviour
930 */
931asmlinkage void do_coprocessor_error(struct pt_regs *regs)
932{
933 void __user *rip = (void __user *)(regs->rip);
934 struct task_struct * task;
935 siginfo_t info;
936 unsigned short cwd, swd;
937
938 conditional_sti(regs);
939 if (!user_mode(regs) &&
940 kernel_math_error(regs, "kernel x87 math error", 16))
941 return;
942
943 /*
944 * Save the info for the exception handler and clear the error.
945 */
946 task = current;
947 save_init_fpu(task);
948 task->thread.trap_no = 16;
949 task->thread.error_code = 0;
950 info.si_signo = SIGFPE;
951 info.si_errno = 0;
952 info.si_code = __SI_FAULT;
953 info.si_addr = rip;
954 /*
955 * (~cwd & swd) will mask out exceptions that are not set to unmasked
956 * status. 0x3f is the exception bits in these regs, 0x200 is the
957 * C1 reg you need in case of a stack fault, 0x040 is the stack
958 * fault bit. We should only be taking one exception at a time,
959 * so if this combination doesn't produce any single exception,
960 * then we have a bad program that isn't synchronizing its FPU usage
961 * and it will suffer the consequences since we won't be able to
962 * fully reproduce the context of the exception
963 */
964 cwd = get_fpu_cwd(task);
965 swd = get_fpu_swd(task);
966 switch (swd & ~cwd & 0x3f) {
967 case 0x000:
968 default:
969 break;
970 case 0x001: /* Invalid Op */
971 /*
972 * swd & 0x240 == 0x040: Stack Underflow
973 * swd & 0x240 == 0x240: Stack Overflow
974 * User must clear the SF bit (0x40) if set
975 */
976 info.si_code = FPE_FLTINV;
977 break;
978 case 0x002: /* Denormalize */
979 case 0x010: /* Underflow */
980 info.si_code = FPE_FLTUND;
981 break;
982 case 0x004: /* Zero Divide */
983 info.si_code = FPE_FLTDIV;
984 break;
985 case 0x008: /* Overflow */
986 info.si_code = FPE_FLTOVF;
987 break;
988 case 0x020: /* Precision */
989 info.si_code = FPE_FLTRES;
990 break;
991 }
992 force_sig_info(SIGFPE, &info, task);
993}
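
In the switch above, (swd & ~cwd & 0x3f) keeps only the exception bits that are both raised in the x87 status word and unmasked in the control word. A small standalone sketch of that decoding; the sample cwd/swd values are invented for illustration:

#include <stdio.h>

/* Map an unmasked x87 exception bit to a short name, as the switch above does. */
static const char *fpu_exception_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x001: return "invalid op (FPE_FLTINV)";
	case 0x002:
	case 0x010: return "denormal/underflow (FPE_FLTUND)";
	case 0x004: return "zero divide (FPE_FLTDIV)";
	case 0x008: return "overflow (FPE_FLTOVF)";
	case 0x020: return "precision (FPE_FLTRES)";
	default:    return "none/ambiguous";
	}
}

int main(void)
{
	/* cwd = 0x037e leaves only the invalid-op exception (bit 0) unmasked. */
	printf("%s\n", fpu_exception_name(0x037e, 0x0001));
	return 0;
}
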
994
995asmlinkage void bad_intr(void)
996{
997 printk("bad interrupt");
998}
999
1000asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1001{
1002 void __user *rip = (void __user *)(regs->rip);
1003 struct task_struct * task;
1004 siginfo_t info;
1005 unsigned short mxcsr;
1006
1007 conditional_sti(regs);
1008 if (!user_mode(regs) &&
1009 kernel_math_error(regs, "kernel simd math error", 19))
1010 return;
1011
1012 /*
1013 * Save the info for the exception handler and clear the error.
1014 */
1015 task = current;
1016 save_init_fpu(task);
1017 task->thread.trap_no = 19;
1018 task->thread.error_code = 0;
1019 info.si_signo = SIGFPE;
1020 info.si_errno = 0;
1021 info.si_code = __SI_FAULT;
1022 info.si_addr = rip;
1023 /*
1024 * The SIMD FPU exceptions are handled a little differently, as there
1025 * is only a single status/control register. Thus, to determine which
1026 * unmasked exception was caught we must mask the exception mask bits
1027 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1028 */
1029 mxcsr = get_fpu_mxcsr(task);
1030 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1031 case 0x000:
1032 default:
1033 break;
1034 case 0x001: /* Invalid Op */
1035 info.si_code = FPE_FLTINV;
1036 break;
1037 case 0x002: /* Denormalize */
1038 case 0x010: /* Underflow */
1039 info.si_code = FPE_FLTUND;
1040 break;
1041 case 0x004: /* Zero Divide */
1042 info.si_code = FPE_FLTDIV;
1043 break;
1044 case 0x008: /* Overflow */
1045 info.si_code = FPE_FLTOVF;
1046 break;
1047 case 0x020: /* Precision */
1048 info.si_code = FPE_FLTRES;
1049 break;
1050 }
1051 force_sig_info(SIGFPE, &info, task);
1052}
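
MXCSR keeps the SSE exception mask bits at 0x1f80 and the exception flag bits at 0x3f, so shifting the masks right by 7 and inverting them isolates the unmasked, pending exceptions, as the comment above describes. A standalone sketch of just that computation with an invented MXCSR value:

#include <stdio.h>

/* Extract the unmasked, pending SSE exception bits from an MXCSR value. */
static unsigned int simd_unmasked_exceptions(unsigned int mxcsr)
{
	return ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);
}

int main(void)
{
	/* All exceptions masked except divide-by-zero (mask bit 9 cleared);
	 * the ZE flag (bit 2) is raised. */
	unsigned int mxcsr = (0x1f80 & ~(1u << 9)) | (1u << 2);

	printf("unmasked exceptions: 0x%03x\n", simd_unmasked_exceptions(mxcsr));
	return 0;
}
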
1053
1054asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1055{
1056}
1057
1058asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1059{
1060}
1061
1062asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1063{
1064}
1065
1066/*
1067 * 'math_state_restore()' saves the current math information in the
1068 * old math state array, and gets the new ones from the current task
1069 *
1070 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1071 * Don't touch unless you *really* know how it works.
1072 */
1073asmlinkage void math_state_restore(void)
1074{
1075 struct task_struct *me = current;
1076 clts(); /* Allow maths ops (or we recurse) */
1077
1078 if (!used_math())
1079 init_fpu(me);
1080 restore_fpu_checking(&me->thread.i387.fxsave);
1081 task_thread_info(me)->status |= TS_USEDFPU;
1082 me->fpu_counter++;
1083}
1084
1085void __init trap_init(void)
1086{
1087 set_intr_gate(0,&divide_error);
1088 set_intr_gate_ist(1,&debug,DEBUG_STACK);
1089 set_intr_gate_ist(2,&nmi,NMI_STACK);
1090 set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
1091 set_system_gate(4,&overflow); /* int4 can be called from all */
1092 set_intr_gate(5,&bounds);
1093 set_intr_gate(6,&invalid_op);
1094 set_intr_gate(7,&device_not_available);
1095 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
1096 set_intr_gate(9,&coprocessor_segment_overrun);
1097 set_intr_gate(10,&invalid_TSS);
1098 set_intr_gate(11,&segment_not_present);
1099 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
1100 set_intr_gate(13,&general_protection);
1101 set_intr_gate(14,&page_fault);
1102 set_intr_gate(15,&spurious_interrupt_bug);
1103 set_intr_gate(16,&coprocessor_error);
1104 set_intr_gate(17,&alignment_check);
1105#ifdef CONFIG_X86_MCE
1106 set_intr_gate_ist(18,&machine_check, MCE_STACK);
1107#endif
1108 set_intr_gate(19,&simd_coprocessor_error);
1109
1110#ifdef CONFIG_IA32_EMULATION
1111 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1112#endif
1113
1114 /*
1115 * Should be a barrier for any external CPU state.
1116 */
1117 cpu_init();
1118}
1119
1120
1121static int __init oops_setup(char *s)
1122{
1123 if (!s)
1124 return -EINVAL;
1125 if (!strcmp(s, "panic"))
1126 panic_on_oops = 1;
1127 return 0;
1128}
1129early_param("oops", oops_setup);
1130
1131static int __init kstack_setup(char *s)
1132{
1133 if (!s)
1134 return -EINVAL;
1135 kstack_depth_to_print = simple_strtoul(s,NULL,0);
1136 return 0;
1137}
1138early_param("kstack", kstack_setup);
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
new file mode 100644
index 000000000000..3ed0ae8c918d
--- /dev/null
+++ b/arch/x86/kernel/tsc_32.c
@@ -0,0 +1,413 @@
1/*
2 * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
3 * which was originally moved from arch/i386/kernel/time.c.
4 * See comments there for proper credits.
5 */
6
7#include <linux/sched.h>
8#include <linux/clocksource.h>
9#include <linux/workqueue.h>
10#include <linux/cpufreq.h>
11#include <linux/jiffies.h>
12#include <linux/init.h>
13#include <linux/dmi.h>
14
15#include <asm/delay.h>
16#include <asm/tsc.h>
17#include <asm/io.h>
18#include <asm/timer.h>
19
20#include "mach_timer.h"
21
22static int tsc_enabled;
23
24/*
25 * On some systems the TSC frequency does not
26 * change with the cpu frequency. So we need
27 * an extra value to store the TSC freq
28 */
29unsigned int tsc_khz;
30EXPORT_SYMBOL_GPL(tsc_khz);
31
32int tsc_disable;
33
34#ifdef CONFIG_X86_TSC
35static int __init tsc_setup(char *str)
36{
37 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
38 "cannot disable TSC.\n");
39 return 1;
40}
41#else
42/*
43 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
44 * in cpu/common.c
45 */
46static int __init tsc_setup(char *str)
47{
48 tsc_disable = 1;
49
50 return 1;
51}
52#endif
53
54__setup("notsc", tsc_setup);
55
56/*
57 * code to mark and check if the TSC is unstable
58 * due to cpufreq or due to unsynced TSCs
59 */
60static int tsc_unstable;
61
62int check_tsc_unstable(void)
63{
64 return tsc_unstable;
65}
66EXPORT_SYMBOL_GPL(check_tsc_unstable);
67
68/* Accelerators for sched_clock()
69 * convert from cycles(64bits) => nanoseconds (64bits)
70 * basic equation:
71 * ns = cycles / (freq / ns_per_sec)
72 * ns = cycles * (ns_per_sec / freq)
73 * ns = cycles * (10^9 / (cpu_khz * 10^3))
74 * ns = cycles * (10^6 / cpu_khz)
75 *
76 * Then we use scaling math (suggested by george@mvista.com) to get:
77 * ns = cycles * (10^6 * SC / cpu_khz) / SC
78 * ns = cycles * cyc2ns_scale / SC
79 *
80 * And since SC is a constant power of two, we can convert the div
81 * into a shift.
82 *
83 * We can use a khz divisor instead of mhz to keep better precision, since
84 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
85 * (mathieu.desnoyers@polymtl.ca)
86 *
87 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
88 */
89unsigned long cyc2ns_scale __read_mostly;
90
91#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
92
93static inline void set_cyc2ns_scale(unsigned long cpu_khz)
94{
95 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
96}
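
With CYC2NS_SCALE_FACTOR = 10, the scale factor is (10^6 << 10) / cpu_khz, so a cycle count converts to nanoseconds with one multiply and one shift. A quick standalone check of the arithmetic; the 2 GHz figure is only an example:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10

int main(void)
{
	unsigned long cpu_khz = 2000000;		/* example: 2 GHz */
	unsigned long scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cycles = 2000000000ULL;	/* one second at 2 GHz */
	unsigned long long ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR;

	/* scale == 512; 2e9 cycles * 512 >> 10 == 1e9 ns, i.e. exactly one second. */
	printf("scale=%lu, ns=%llu\n", scale, ns);
	return 0;
}
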
97
98/*
99 * Scheduler clock - returns current time in nanosec units.
100 */
101unsigned long long native_sched_clock(void)
102{
103 unsigned long long this_offset;
104
105 /*
106 * Fall back to jiffies if there's no TSC available:
107 * ( But note that we still use it if the TSC is marked
108 * unstable. We do this because unlike Time Of Day,
109 * the scheduler clock tolerates small errors and it's
110 * very important for it to be as fast as the platform
111 *   can achieve it. )
112 */
113 if (unlikely(!tsc_enabled && !tsc_unstable))
114 /* No locking but a rare wrong value is not a big deal: */
115 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
116
117 /* read the Time Stamp Counter: */
118 rdtscll(this_offset);
119
120 /* return the value in ns */
121 return cycles_2_ns(this_offset);
122}
123
124/* We need to define a real function for sched_clock, to override the
125 weak default version */
126#ifdef CONFIG_PARAVIRT
127unsigned long long sched_clock(void)
128{
129 return paravirt_sched_clock();
130}
131#else
132unsigned long long sched_clock(void)
133 __attribute__((alias("native_sched_clock")));
134#endif
135
136unsigned long native_calculate_cpu_khz(void)
137{
138 unsigned long long start, end;
139 unsigned long count;
140 u64 delta64;
141 int i;
142 unsigned long flags;
143
144 local_irq_save(flags);
145
146 /* run 3 times to ensure the cache is warm */
147 for (i = 0; i < 3; i++) {
148 mach_prepare_counter();
149 rdtscll(start);
150 mach_countup(&count);
151 rdtscll(end);
152 }
153 /*
154 * Error: ECTCNEVERSET
155 * The CTC wasn't reliable: we got a hit on the very first read,
156 * or the CPU was so fast/slow that the quotient wouldn't fit in
157 * 32 bits..
158 */
159 if (count <= 1)
160 goto err;
161
162 delta64 = end - start;
163
164 /* cpu freq too fast: */
165 if (delta64 > (1ULL<<32))
166 goto err;
167
168 /* cpu freq too slow: */
169 if (delta64 <= CALIBRATE_TIME_MSEC)
170 goto err;
171
172 delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
173 do_div(delta64,CALIBRATE_TIME_MSEC);
174
175 local_irq_restore(flags);
176 return (unsigned long)delta64;
177err:
178 local_irq_restore(flags);
179 return 0;
180}
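
The calibration above counts TSC cycles across a fixed PIT interval of CALIBRATE_TIME_MSEC milliseconds (the constant comes from the included mach_timer.h), so cycles divided by milliseconds is the CPU clock in kHz. A standalone sketch of just that arithmetic; the 30 ms interval and the cycle count are assumed stand-ins:

#include <stdio.h>

/* Assumed stand-in for illustration; the real value lives in mach_timer.h. */
#define CALIBRATE_TIME_MSEC 30

int main(void)
{
	/* Pretend the TSC advanced by this many cycles over the interval. */
	unsigned long long delta_cycles = 72000000ULL;
	unsigned long long khz;

	/* Round, then divide: cycles per millisecond is exactly kHz. */
	khz = (delta_cycles + CALIBRATE_TIME_MSEC / 2) / CALIBRATE_TIME_MSEC;
	printf("calibrated cpu_khz = %llu (%.3f MHz)\n", khz, khz / 1000.0);
	return 0;
}
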
181
182int recalibrate_cpu_khz(void)
183{
184#ifndef CONFIG_SMP
185 unsigned long cpu_khz_old = cpu_khz;
186
187 if (cpu_has_tsc) {
188 cpu_khz = calculate_cpu_khz();
189 tsc_khz = cpu_khz;
190 cpu_data[0].loops_per_jiffy =
191 cpufreq_scale(cpu_data[0].loops_per_jiffy,
192 cpu_khz_old, cpu_khz);
193 return 0;
194 } else
195 return -ENODEV;
196#else
197 return -ENODEV;
198#endif
199}
200
201EXPORT_SYMBOL(recalibrate_cpu_khz);
202
203#ifdef CONFIG_CPU_FREQ
204
205/*
206 * if the CPU frequency is scaled, TSC-based delays will need a different
207 * loops_per_jiffy value to function properly.
208 */
209static unsigned int ref_freq = 0;
210static unsigned long loops_per_jiffy_ref = 0;
211static unsigned long cpu_khz_ref = 0;
212
213static int
214time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
215{
216 struct cpufreq_freqs *freq = data;
217
218 if (!ref_freq) {
219 if (!freq->old){
220 ref_freq = freq->new;
221 return 0;
222 }
223 ref_freq = freq->old;
224 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
225 cpu_khz_ref = cpu_khz;
226 }
227
228 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
229 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
230 (val == CPUFREQ_RESUMECHANGE)) {
231 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
232 cpu_data[freq->cpu].loops_per_jiffy =
233 cpufreq_scale(loops_per_jiffy_ref,
234 ref_freq, freq->new);
235
236 if (cpu_khz) {
237
238 if (num_online_cpus() == 1)
239 cpu_khz = cpufreq_scale(cpu_khz_ref,
240 ref_freq, freq->new);
241 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
242 tsc_khz = cpu_khz;
243 set_cyc2ns_scale(cpu_khz);
244 /*
245 * TSC based sched_clock turns
246 * to junk w/ cpufreq
247 */
248 mark_tsc_unstable("cpufreq changes");
249 }
250 }
251 }
252
253 return 0;
254}
255
256static struct notifier_block time_cpufreq_notifier_block = {
257 .notifier_call = time_cpufreq_notifier
258};
259
260static int __init cpufreq_tsc(void)
261{
262 return cpufreq_register_notifier(&time_cpufreq_notifier_block,
263 CPUFREQ_TRANSITION_NOTIFIER);
264}
265core_initcall(cpufreq_tsc);
266
267#endif
268
269/* clock source code */
270
271static unsigned long current_tsc_khz = 0;
272
273static cycle_t read_tsc(void)
274{
275 cycle_t ret;
276
277 rdtscll(ret);
278
279 return ret;
280}
281
282static struct clocksource clocksource_tsc = {
283 .name = "tsc",
284 .rating = 300,
285 .read = read_tsc,
286 .mask = CLOCKSOURCE_MASK(64),
287 .mult = 0, /* to be set */
288 .shift = 22,
289 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
290 CLOCK_SOURCE_MUST_VERIFY,
291};
292
293void mark_tsc_unstable(char *reason)
294{
295 if (!tsc_unstable) {
296 tsc_unstable = 1;
297 tsc_enabled = 0;
298 printk("Marking TSC unstable due to: %s.\n", reason);
299 /* Can be called before registration */
300 if (clocksource_tsc.mult)
301 clocksource_change_rating(&clocksource_tsc, 0);
302 else
303 clocksource_tsc.rating = 0;
304 }
305}
306EXPORT_SYMBOL_GPL(mark_tsc_unstable);
307
308static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
309{
310 printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
311 d->ident);
312 tsc_unstable = 1;
313 return 0;
314}
315
316/* List of systems that have known TSC problems */
317static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
318 {
319 .callback = dmi_mark_tsc_unstable,
320 .ident = "IBM Thinkpad 380XD",
321 .matches = {
322 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
323 DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
324 },
325 },
326 {}
327};
328
329/*
330 * Make an educated guess if the TSC is trustworthy and synchronized
331 * over all CPUs.
332 */
333__cpuinit int unsynchronized_tsc(void)
334{
335 if (!cpu_has_tsc || tsc_unstable)
336 return 1;
337 /*
338 * Intel systems are normally all synchronized.
339 * Exceptions must mark TSC as unstable:
340 */
341 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
342 /* assume multi socket systems are not synchronized: */
343 if (num_possible_cpus() > 1)
344 tsc_unstable = 1;
345 }
346 return tsc_unstable;
347}
348
349/*
350 * Geode_LX - the OLPC CPU has a possibly very reliable TSC
351 */
352#ifdef CONFIG_MGEODE_LX
353/* RTSC counts during suspend */
354#define RTSC_SUSP 0x100
355
356static void __init check_geode_tsc_reliable(void)
357{
358 unsigned long val;
359
360 rdmsrl(MSR_GEODE_BUSCONT_CONF0, val);
361 if ((val & RTSC_SUSP))
362 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
363}
364#else
365static inline void check_geode_tsc_reliable(void) { }
366#endif
367
368
369void __init tsc_init(void)
370{
371 if (!cpu_has_tsc || tsc_disable)
372 goto out_no_tsc;
373
374 cpu_khz = calculate_cpu_khz();
375 tsc_khz = cpu_khz;
376
377 if (!cpu_khz)
378 goto out_no_tsc;
379
380 printk("Detected %lu.%03lu MHz processor.\n",
381 (unsigned long)cpu_khz / 1000,
382 (unsigned long)cpu_khz % 1000);
383
384 set_cyc2ns_scale(cpu_khz);
385 use_tsc_delay();
386
387 /* Check and install the TSC clocksource */
388 dmi_check_system(bad_tsc_dmi_table);
389
390 unsynchronized_tsc();
391 check_geode_tsc_reliable();
392 current_tsc_khz = tsc_khz;
393 clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
394 clocksource_tsc.shift);
395	/* lower the rating if we already know it's unstable: */
396 if (check_tsc_unstable()) {
397 clocksource_tsc.rating = 0;
398 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
399 } else
400 tsc_enabled = 1;
401
402 clocksource_register(&clocksource_tsc);
403
404 return;
405
406out_no_tsc:
407 /*
408 * Set the tsc_disable flag if there's no TSC support, this
409 * makes it a fast flag for the kernel to see whether it
410 * should be using the TSC.
411 */
412 tsc_disable = 1;
413}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
new file mode 100644
index 000000000000..2a59bde663f2
--- /dev/null
+++ b/arch/x86/kernel/tsc_64.c
@@ -0,0 +1,207 @@
1#include <linux/kernel.h>
2#include <linux/sched.h>
3#include <linux/interrupt.h>
4#include <linux/init.h>
5#include <linux/clocksource.h>
6#include <linux/time.h>
7#include <linux/acpi.h>
8#include <linux/cpufreq.h>
9
10#include <asm/timex.h>
11
12static int notsc __initdata = 0;
13
14unsigned int cpu_khz; /* TSC clocks / usec, not used here */
15EXPORT_SYMBOL(cpu_khz);
16unsigned int tsc_khz;
17EXPORT_SYMBOL(tsc_khz);
18
19static unsigned int cyc2ns_scale __read_mostly;
20
21void set_cyc2ns_scale(unsigned long khz)
22{
23 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
24}
25
26static unsigned long long cycles_2_ns(unsigned long long cyc)
27{
28 return (cyc * cyc2ns_scale) >> NS_SCALE;
29}
30
31unsigned long long sched_clock(void)
32{
33 unsigned long a = 0;
34
35 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
36 * which means it is not completely exact and may not be monotonic
37 * between CPUs. But the errors should be too small to matter for
38 * scheduling purposes.
39 */
40
41 rdtscll(a);
42 return cycles_2_ns(a);
43}
44
45static int tsc_unstable;
46
47inline int check_tsc_unstable(void)
48{
49 return tsc_unstable;
50}
51#ifdef CONFIG_CPU_FREQ
52
53/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
54 * changes.
55 *
56 * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
57 * not that important because current Opteron setups do not support
58 * scaling on SMP anyway.
59 *
60 * Should fix up last_tsc too. Currently gettimeofday in the
61 * first tick after the change will be slightly wrong.
62 */
63
64static unsigned int ref_freq;
65static unsigned long loops_per_jiffy_ref;
66static unsigned long tsc_khz_ref;
67
68static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
69 void *data)
70{
71 struct cpufreq_freqs *freq = data;
72 unsigned long *lpj, dummy;
73
74 if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
75 return 0;
76
77 lpj = &dummy;
78 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
79#ifdef CONFIG_SMP
80 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
81#else
82 lpj = &boot_cpu_data.loops_per_jiffy;
83#endif
84
85 if (!ref_freq) {
86 ref_freq = freq->old;
87 loops_per_jiffy_ref = *lpj;
88 tsc_khz_ref = tsc_khz;
89 }
90 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
91 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
92 (val == CPUFREQ_RESUMECHANGE)) {
93 *lpj =
94 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
95
96 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
97 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
98 mark_tsc_unstable("cpufreq changes");
99 }
100
101 set_cyc2ns_scale(tsc_khz_ref);
102
103 return 0;
104}
105
106static struct notifier_block time_cpufreq_notifier_block = {
107 .notifier_call = time_cpufreq_notifier
108};
109
110static int __init cpufreq_tsc(void)
111{
112 cpufreq_register_notifier(&time_cpufreq_notifier_block,
113 CPUFREQ_TRANSITION_NOTIFIER);
114 return 0;
115}
116
117core_initcall(cpufreq_tsc);
118
119#endif
120
121/*
122 * Make an educated guess if the TSC is trustworthy and synchronized
123 * over all CPUs.
124 */
125__cpuinit int unsynchronized_tsc(void)
126{
127 if (tsc_unstable)
128 return 1;
129
130#ifdef CONFIG_SMP
131 if (apic_is_clustered_box())
132 return 1;
133#endif
134	/* Most Intel systems have synchronized TSCs except for
135	   multi-node systems */
136 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
137#ifdef CONFIG_ACPI
138 /* But TSC doesn't tick in C3 so don't use it there */
139 if (acpi_gbl_FADT.header.length > 0 &&
140 acpi_gbl_FADT.C3latency < 1000)
141 return 1;
142#endif
143 return 0;
144 }
145
146 /* Assume multi socket systems are not synchronized */
147 return num_present_cpus() > 1;
148}
149
150int __init notsc_setup(char *s)
151{
152 notsc = 1;
153 return 1;
154}
155
156__setup("notsc", notsc_setup);
157
158
159/* clock source code: */
160static cycle_t read_tsc(void)
161{
162 cycle_t ret = (cycle_t)get_cycles_sync();
163 return ret;
164}
165
166static cycle_t __vsyscall_fn vread_tsc(void)
167{
168 cycle_t ret = (cycle_t)get_cycles_sync();
169 return ret;
170}
171
172static struct clocksource clocksource_tsc = {
173 .name = "tsc",
174 .rating = 300,
175 .read = read_tsc,
176 .mask = CLOCKSOURCE_MASK(64),
177 .shift = 22,
178 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
179 CLOCK_SOURCE_MUST_VERIFY,
180 .vread = vread_tsc,
181};
182
183void mark_tsc_unstable(char *reason)
184{
185 if (!tsc_unstable) {
186 tsc_unstable = 1;
187 printk("Marking TSC unstable due to %s\n", reason);
188 /* Change only the rating, when not registered */
189 if (clocksource_tsc.mult)
190 clocksource_change_rating(&clocksource_tsc, 0);
191 else
192 clocksource_tsc.rating = 0;
193 }
194}
195EXPORT_SYMBOL_GPL(mark_tsc_unstable);
196
197void __init init_tsc_clocksource(void)
198{
199 if (!notsc) {
200 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
201 clocksource_tsc.shift);
202 if (check_tsc_unstable())
203 clocksource_tsc.rating = 0;
204
205 clocksource_register(&clocksource_tsc);
206 }
207}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 000000000000..355f5f506c81
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1,187 @@
1/*
2 * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
3 *
4 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
5 *
6 * We check whether all boot CPUs have their TSC's synchronized,
7 * print a warning if not and turn off the TSC clock-source.
8 *
9 * The warp-check is point-to-point between two CPUs, the CPU
10 * initiating the bootup is the 'source CPU', the freshly booting
11 * CPU is the 'target CPU'.
12 *
13 * Only two CPUs may participate - they can enter in any order.
14 * ( The serial nature of the boot logic and the CPU hotplug lock
15 * protects against more than 2 CPUs entering this code. )
16 */
17#include <linux/spinlock.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h>
21#include <linux/nmi.h>
22#include <asm/tsc.h>
23
24/*
25 * Entry/exit counters that make sure that both CPUs
26 * run the measurement code at once:
27 */
28static __cpuinitdata atomic_t start_count;
29static __cpuinitdata atomic_t stop_count;
30
31/*
32 * We use a raw spinlock in this exceptional case, because
33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps:
35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps;
40
41/*
42 * TSC-warp measurement loop running on both CPUs:
43 */
44static __cpuinit void check_tsc_warp(void)
45{
46 cycles_t start, now, prev, end;
47 int i;
48
49 start = get_cycles_sync();
50 /*
51 * The measurement runs for 20 msecs:
52 */
53 end = start + tsc_khz * 20ULL;
54 now = start;
55
56 for (i = 0; ; i++) {
57 /*
58 * We take the global lock, measure TSC, save the
59 * previous TSC that was measured (possibly on
60 * another CPU) and update the previous TSC timestamp.
61 */
62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc;
64 now = get_cycles_sync();
65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock);
67
68 /*
69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million
71		 * loops safety exit, so we don't lock up in case the
72 * TSC readout is totally broken]):
73 */
74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000)
76 break;
77 cpu_relax();
78 touch_nmi_watchdog();
79 }
80 /*
81 * Outside the critical section we can now see whether
82 * we saw a time-warp of the TSC going backwards:
83 */
84 if (unlikely(prev > now)) {
85 __raw_spin_lock(&sync_lock);
86 max_warp = max(max_warp, prev - now);
87 nr_warps++;
88 __raw_spin_unlock(&sync_lock);
89 }
90
91 }
92}
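
check_tsc_warp() serializes TSC reads from both CPUs through a single lock, so any backwards step between two consecutive global readings shows up as prev > now. A rough user-space analogue of the idea using two threads and a mutex (x86 only, via the compiler's __rdtsc(); the warp bookkeeping is done inside the lock here for simplicity, unlike the kernel version):

#include <pthread.h>
#include <stdio.h>
#include <x86intrin.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long last_tsc, max_warp;
static int nr_warps;

static void *warp_check(void *arg)
{
	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(&lock);
		unsigned long long prev = last_tsc;
		unsigned long long now = __rdtsc();
		last_tsc = now;
		if (prev > now) {		/* time went backwards */
			nr_warps++;
			if (prev - now > max_warp)
				max_warp = prev - now;
		}
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("%d warps, max %llu cycles\n", nr_warps, max_warp);
	return 0;
}

Build with gcc -pthread; on a machine with synchronized TSCs it should report zero warps.
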
93
94/*
95 * Source CPU calls into this - it waits for the freshly booted
96 * target CPU to arrive and then starts the measurement:
97 */
98void __cpuinit check_tsc_sync_source(int cpu)
99{
100 int cpus = 2;
101
102 /*
103 * No need to check if we already know that the TSC is not
104 * synchronized:
105 */
106 if (unsynchronized_tsc())
107 return;
108
109 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
110 smp_processor_id(), cpu);
111
112 /*
113 * Reset it - in case this is a second bootup:
114 */
115 atomic_set(&stop_count, 0);
116
117 /*
118 * Wait for the target to arrive:
119 */
120 while (atomic_read(&start_count) != cpus-1)
121 cpu_relax();
122 /*
123 * Trigger the target to continue into the measurement too:
124 */
125 atomic_inc(&start_count);
126
127 check_tsc_warp();
128
129 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax();
131
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) {
138 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else {
146 printk(" passed.\n");
147 }
148
149 /*
150 * Let the target continue with the bootup:
151 */
152 atomic_inc(&stop_count);
153}
154
155/*
156 * Freshly booted CPUs call into this:
157 */
158void __cpuinit check_tsc_sync_target(void)
159{
160 int cpus = 2;
161
162 if (unsynchronized_tsc())
163 return;
164
165 /*
166 * Register this CPU's participation and wait for the
167 * source CPU to start the measurement:
168 */
169 atomic_inc(&start_count);
170 while (atomic_read(&start_count) != cpus)
171 cpu_relax();
172
173 check_tsc_warp();
174
175 /*
176 * Ok, we are done:
177 */
178 atomic_inc(&stop_count);
179
180 /*
181 * Wait for the source CPU to print stuff:
182 */
183 while (atomic_read(&stop_count) != cpus)
184 cpu_relax();
185}
186#undef NR_LOOPS
187
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
new file mode 100644
index 000000000000..45b6f8a975a1
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -0,0 +1,105 @@
1/*
2 *
3 * verify_cpu.S - Code for cpu long mode and SSE verification. This
4 * code has been borrowed from boot/setup.S and was introduced by
5 * Andi Kleen.
6 *
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 *
11 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details.
13 *
14 * This is common code for verifying whether the CPU supports
15 * long mode and SSE. It is not called directly; instead, this
16 * file is included at various places and compiled in that context.
17 * The current usages are:
18 *
19 * This file is included by both 16bit and 32bit code.
20 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 *	arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
24 *	arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
25 *
26 * verify_cpu returns the status of the cpu check in register %eax.
27 * 0: Success 1: Failure
28 *
29 * The caller needs to check the error code and take the appropriate
30 * action: either display a message or halt.
31 */
32
33#include <asm/cpufeature.h>
34
35verify_cpu:
36 pushfl # Save caller passed flags
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 pushfl # standard way to check for cpuid
41 popl %eax
42 movl %eax,%ebx
43 xorl $0x200000,%eax
44 pushl %eax
45 popfl
46 pushfl
47 popl %eax
48 cmpl %eax,%ebx
49 jz verify_cpu_no_longmode # cpu has no cpuid
50
51 movl $0x0,%eax # See if cpuid 1 is implemented
52 cpuid
53 cmpl $0x1,%eax
54 jb verify_cpu_no_longmode # no cpuid 1
55
56 xor %di,%di
57 cmpl $0x68747541,%ebx # AuthenticAMD
58 jnz verify_cpu_noamd
59 cmpl $0x69746e65,%edx
60 jnz verify_cpu_noamd
61 cmpl $0x444d4163,%ecx
62 jnz verify_cpu_noamd
63 mov $1,%di # cpu is from AMD
64
65verify_cpu_noamd:
66 movl $0x1,%eax # Does the cpu have what it takes
67 cpuid
68 andl $REQUIRED_MASK0,%edx
69 xorl $REQUIRED_MASK0,%edx
70 jnz verify_cpu_no_longmode
71
72 movl $0x80000000,%eax # See if extended cpuid is implemented
73 cpuid
74 cmpl $0x80000001,%eax
75 jb verify_cpu_no_longmode # no extended cpuid
76
77 movl $0x80000001,%eax # Does the cpu have what it takes
78 cpuid
79 andl $REQUIRED_MASK1,%edx
80 xorl $REQUIRED_MASK1,%edx
81 jnz verify_cpu_no_longmode
82
83verify_cpu_sse_test:
84 movl $1,%eax
85 cpuid
86 andl $SSE_MASK,%edx
87 cmpl $SSE_MASK,%edx
88 je verify_cpu_sse_ok
89 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR
92 rdmsr
93 btr $15,%eax # enable SSE
94 wrmsr
95 xor %di,%di # don't loop
96 jmp verify_cpu_sse_test # try again
97
98verify_cpu_no_longmode:
99 popfl # Restore caller passed flags
100 movl $1,%eax
101 ret
102verify_cpu_sse_ok:
103 popfl # Restore caller passed flags
104 xorl %eax, %eax
105 ret
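
The assembly above first proves that CPUID exists by toggling EFLAGS bit 21, then checks extended leaf 0x80000001 against REQUIRED_MASK1, which for a 64-bit kernel is expected to cover the long-mode bit. A sketch of the extended-leaf check in C using the compiler's <cpuid.h> helper; EDX bit 29 is the LM flag (illustrative only, not the boot-path code):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x80000001, EDX bit 29 = long mode (LM). */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
		printf("extended CPUID leaves not available\n");
		return 1;
	}
	printf("long mode %ssupported\n", (edx & (1u << 29)) ? "" : "not ");
	return 0;
}
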
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 000000000000..f2dcd1d27c0a
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,843 @@
1/*
2 * linux/kernel/vm86.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 *
6 * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
7 * stack - Manfred Spraul <manfred@colorfullife.com>
8 *
9 * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
10 * them correctly. Now the emulation will be in a
11 * consistent state after stackfaults - Kasper Dupont
12 * <kasperd@daimi.au.dk>
13 *
14 * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
15 * <kasperd@daimi.au.dk>
16 *
17 * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
18 * caused by Kasper Dupont's changes - Stas Sergeev
19 *
20 * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
21 * Kasper Dupont <kasperd@daimi.au.dk>
22 *
23 * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
24 * Kasper Dupont <kasperd@daimi.au.dk>
25 *
26 * 9 apr 2002 - Changed stack access macros to jump to a label
27 * instead of returning to userspace. This simplifies
28 *              do_int, and is needed by handle_vm86_fault. Kasper
29 * Dupont <kasperd@daimi.au.dk>
30 *
31 */
32
33#include <linux/capability.h>
34#include <linux/errno.h>
35#include <linux/interrupt.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38#include <linux/signal.h>
39#include <linux/string.h>
40#include <linux/mm.h>
41#include <linux/smp.h>
42#include <linux/highmem.h>
43#include <linux/ptrace.h>
44#include <linux/audit.h>
45#include <linux/stddef.h>
46
47#include <asm/uaccess.h>
48#include <asm/io.h>
49#include <asm/tlbflush.h>
50#include <asm/irq.h>
51
52/*
53 * Known problems:
54 *
55 * Interrupt handling is not guaranteed:
56 * - a real x86 will disable all interrupts for one instruction
57 * after a "mov ss,xx" to make stack handling atomic even without
58 * the 'lss' instruction. We can't guarantee this in v86 mode,
59 * as the next instruction might result in a page fault or similar.
60 * - a real x86 will have interrupts disabled for one instruction
61 * past the 'sti' that enables them. We don't bother with all the
62 * details yet.
63 *
64 * Let's hope these problems do not actually matter for anything.
65 */
66
67
68#define KVM86 ((struct kernel_vm86_struct *)regs)
69#define VMPI KVM86->vm86plus
70
71
72/*
73 * 8- and 16-bit register defines..
74 */
75#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79
80/*
81 * virtual flags (16 and 32-bit versions)
82 */
83#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
84#define VEFLAGS (current->thread.v86flags)
85
86#define set_flags(X,new,mask) \
87((X) = ((X) & ~(mask)) | ((new) & (mask)))
88
89#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF)
91
92/* convert kernel_vm86_regs to vm86_regs */
93static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 const struct kernel_vm86_regs *regs)
95{
96 int ret = 0;
97
98 /* kernel_vm86_regs is missing xgs, so copy everything up to
99	   (but not including) orig_eax, and then the rest, including orig_eax. */
100 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
101 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
102 sizeof(struct kernel_vm86_regs) -
103 offsetof(struct kernel_vm86_regs, pt.orig_eax));
104
105 return ret;
106}
107
108/* convert vm86_regs to kernel_vm86_regs */
109static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110 const struct vm86_regs __user *user,
111 unsigned extra)
112{
113 int ret = 0;
114
115 /* copy eax-xfs inclusive */
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
117 /* copy orig_eax-__gsh+extra */
118 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
119 sizeof(struct kernel_vm86_regs) -
120 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
121 extra);
122 return ret;
123}
124
125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
127{
128 struct tss_struct *tss;
129 struct pt_regs *ret;
130 unsigned long tmp;
131
132 /*
133 * This gets called from entry.S with interrupts disabled, but
134 * from process context. Enable interrupts here, before trying
135 * to access user space.
136 */
137 local_irq_enable();
138
139 if (!current->thread.vm86_info) {
140 printk("no vm86_info: BAD\n");
141 do_exit(SIGSEGV);
142 }
143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
146 if (tmp) {
147 printk("vm86: could not access userspace vm86_info\n");
148 do_exit(SIGSEGV);
149 }
150
151 tss = &per_cpu(init_tss, get_cpu());
152 current->thread.esp0 = current->thread.saved_esp0;
153 current->thread.sysenter_cs = __KERNEL_CS;
154 load_esp0(tss, &current->thread);
155 current->thread.saved_esp0 = 0;
156 put_cpu();
157
158 ret = KVM86->regs32;
159
160 ret->xfs = current->thread.saved_fs;
161 loadsegment(gs, current->thread.saved_gs);
162
163 return ret;
164}
165
166static void mark_screen_rdonly(struct mm_struct *mm)
167{
168 pgd_t *pgd;
169 pud_t *pud;
170 pmd_t *pmd;
171 pte_t *pte;
172 spinlock_t *ptl;
173 int i;
174
175 pgd = pgd_offset(mm, 0xA0000);
176 if (pgd_none_or_clear_bad(pgd))
177 goto out;
178 pud = pud_offset(pgd, 0xA0000);
179 if (pud_none_or_clear_bad(pud))
180 goto out;
181 pmd = pmd_offset(pud, 0xA0000);
182 if (pmd_none_or_clear_bad(pmd))
183 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
185 for (i = 0; i < 32; i++) {
186 if (pte_present(*pte))
187 set_pte(pte, pte_wrprotect(*pte));
188 pte++;
189 }
190 pte_unmap_unlock(pte, ptl);
191out:
192 flush_tlb();
193}
194
195
196
197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199
200asmlinkage int sys_vm86old(struct pt_regs regs)
201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
203 struct kernel_vm86_struct info; /* declare this _on top_,
204					 * this avoids wasting stack space.
205 * This remains on the stack until we
206 * return to 32 bit user space.
207 */
208 struct task_struct *tsk;
209 int tmp, ret = -EPERM;
210
211 tsk = current;
212 if (tsk->thread.saved_esp0)
213 goto out;
214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
215 offsetof(struct kernel_vm86_struct, vm86plus) -
216 sizeof(info.regs));
217 ret = -EFAULT;
218 if (tmp)
219 goto out;
220 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
221 info.regs32 = &regs;
222 tsk->thread.vm86_info = v86;
223 do_sys_vm86(&info, tsk);
224 ret = 0; /* we never return here */
225out:
226 return ret;
227}
228
229
230asmlinkage int sys_vm86(struct pt_regs regs)
231{
232 struct kernel_vm86_struct info; /* declare this _on top_,
233					 * this avoids wasting stack space.
234 * This remains on the stack until we
235 * return to 32 bit user space.
236 */
237 struct task_struct *tsk;
238 int tmp, ret;
239 struct vm86plus_struct __user *v86;
240
241 tsk = current;
242 switch (regs.ebx) {
243 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
248 goto out;
249 case VM86_PLUS_INSTALL_CHECK:
250 /* NOTE: on old vm86 stuff this will return the error
251 from access_ok(), because the subfunction is
252		   interpreted as an (invalid) address of a vm86_struct.
253 So the installation check works.
254 */
255 ret = 0;
256 goto out;
257 }
258
259 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
260 ret = -EPERM;
261 if (tsk->thread.saved_esp0)
262 goto out;
263 v86 = (struct vm86plus_struct __user *)regs.ecx;
264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
265 offsetof(struct kernel_vm86_struct, regs32) -
266 sizeof(info.regs));
267 ret = -EFAULT;
268 if (tmp)
269 goto out;
270 info.regs32 = &regs;
271 info.vm86plus.is_vm86pus = 1;
272 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
273 do_sys_vm86(&info, tsk);
274 ret = 0; /* we never return here */
275out:
276 return ret;
277}
278
279
280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
281{
282 struct tss_struct *tss;
283/*
284 * make sure the vm86() system call doesn't try to do anything silly
285 */
286 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0;
288 info->regs.pt.xfs = 0;
289
290/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored.
292 */
293
294/*
295 * The eflags register is also special: we cannot trust that the user
295 * has set it up safely, so this makes sure the interrupt and other flags are
297 * inherited from protected mode.
298 */
299 VEFLAGS = info->regs.pt.eflags;
300 info->regs.pt.eflags &= SAFE_MASK;
301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
302 info->regs.pt.eflags |= VM_MASK;
303
304 switch (info->cpu_type) {
305 case CPU_286:
306 tsk->thread.v86mask = 0;
307 break;
308 case CPU_386:
309 tsk->thread.v86mask = NT_MASK | IOPL_MASK;
310 break;
311 case CPU_486:
312 tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
313 break;
314 default:
315 tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
316 break;
317 }
318
319/*
320 * Save old state, set default return value (%eax) to 0
321 */
322 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 tsk->thread.saved_fs = info->regs32->xfs;
325 savesegment(gs, tsk->thread.saved_gs);
326
327 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
329 if (cpu_has_sep)
330 tsk->thread.sysenter_cs = 0;
331 load_esp0(tss, &tsk->thread);
332 put_cpu();
333
334 tsk->thread.screen_bitmap = info->screen_bitmap;
335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm);
337
338 /*call audit_syscall_exit since we do not exit via the normal paths */
339 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
341
342 __asm__ __volatile__(
343 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t"
345 "mov %2, %%gs\n\t"
346 "jmp resume_userspace"
347 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
349 /* we never return here */
350}
351
352static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
353{
354 struct pt_regs * regs32;
355
356 regs32 = save_v86_state(regs16);
357 regs32->eax = retval;
358 __asm__ __volatile__("movl %0,%%esp\n\t"
359 "movl %1,%%ebp\n\t"
360 "jmp resume_userspace"
361 : : "r" (regs32), "r" (current_thread_info()));
362}
363
364static inline void set_IF(struct kernel_vm86_regs * regs)
365{
366 VEFLAGS |= VIF_MASK;
367 if (VEFLAGS & VIP_MASK)
368 return_to_32bit(regs, VM86_STI);
369}
370
371static inline void clear_IF(struct kernel_vm86_regs * regs)
372{
373 VEFLAGS &= ~VIF_MASK;
374}
375
376static inline void clear_TF(struct kernel_vm86_regs * regs)
377{
378 regs->pt.eflags &= ~TF_MASK;
379}
380
381static inline void clear_AC(struct kernel_vm86_regs * regs)
382{
383 regs->pt.eflags &= ~AC_MASK;
384}
385
386/* It is correct to call set_IF(regs) from the set_vflags_*
387 * functions. However someone forgot to call clear_IF(regs)
388 * in the opposite case.
389 * After the command sequence CLI PUSHF STI POPF you should
390 * end up with interrupts disabled, but you ended up with
391 * interrupts enabled.
392 * ( I was testing my own changes, but the only bug I
393 * could find was in a function I had not changed. )
394 * [KD]
395 */
396
397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
398{
399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
401 if (eflags & IF_MASK)
402 set_IF(regs);
403 else
404 clear_IF(regs);
405}
406
407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
408{
409 set_flags(VFLAGS, flags, current->thread.v86mask);
410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
411 if (flags & IF_MASK)
412 set_IF(regs);
413 else
414 clear_IF(regs);
415}
416
417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
418{
419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
420
421 if (VEFLAGS & VIF_MASK)
422 flags |= IF_MASK;
423 flags |= IOPL_MASK;
424 return flags | (VEFLAGS & current->thread.v86mask);
425}
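
get_vflags() builds the flags image a vm86 guest sees from the real EFLAGS filtered by RETURN_MASK, the virtual IF bit, a forced IOPL, and whatever bits the cpu_type mask lets the guest own. A standalone sketch of that composition; the architectural EFLAGS bit values are spelled out as defines:

#include <stdio.h>

#define IF_MASK		0x00000200	/* architectural EFLAGS.IF */
#define IOPL_MASK	0x00003000	/* architectural EFLAGS.IOPL */
#define VIF_MASK	0x00080000	/* architectural EFLAGS.VIF */
#define RETURN_MASK	0xDFF		/* same value as the define above */

/* Rebuild the flags image a vm86 guest sees, as get_vflags() does. */
static unsigned long guest_flags(unsigned long eflags, unsigned long veflags,
				 unsigned long v86mask)
{
	unsigned long flags = eflags & RETURN_MASK;

	if (veflags & VIF_MASK)
		flags |= IF_MASK;
	flags |= IOPL_MASK;
	return flags | (veflags & v86mask);
}

int main(void)
{
	/* Virtual IF set: the guest sees IF=1 and IOPL=3 regardless of real EFLAGS. */
	printf("0x%lx\n", guest_flags(0x00000002, VIF_MASK, 0));
	return 0;
}
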
426
427static inline int is_revectored(int nr, struct revectored_struct * bitmap)
428{
429 __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
430 :"=r" (nr)
431 :"m" (*bitmap),"r" (nr));
432 return nr;
433}
434
435#define val_byte(val, n) (((__u8 *)&val)[n])
436
437#define pushb(base, ptr, val, err_label) \
438 do { \
439 __u8 __val = val; \
440 ptr--; \
441 if (put_user(__val, base + ptr) < 0) \
442 goto err_label; \
443 } while(0)
444
445#define pushw(base, ptr, val, err_label) \
446 do { \
447 __u16 __val = val; \
448 ptr--; \
449 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
450 goto err_label; \
451 ptr--; \
452 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
453 goto err_label; \
454 } while(0)
455
456#define pushl(base, ptr, val, err_label) \
457 do { \
458 __u32 __val = val; \
459 ptr--; \
460 if (put_user(val_byte(__val, 3), base + ptr) < 0) \
461 goto err_label; \
462 ptr--; \
463 if (put_user(val_byte(__val, 2), base + ptr) < 0) \
464 goto err_label; \
465 ptr--; \
466 if (put_user(val_byte(__val, 1), base + ptr) < 0) \
467 goto err_label; \
468 ptr--; \
469 if (put_user(val_byte(__val, 0), base + ptr) < 0) \
470 goto err_label; \
471 } while(0)
472
473#define popb(base, ptr, err_label) \
474 ({ \
475 __u8 __res; \
476 if (get_user(__res, base + ptr) < 0) \
477 goto err_label; \
478 ptr++; \
479 __res; \
480 })
481
482#define popw(base, ptr, err_label) \
483 ({ \
484 __u16 __res; \
485 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
486 goto err_label; \
487 ptr++; \
488 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
489 goto err_label; \
490 ptr++; \
491 __res; \
492 })
493
494#define popl(base, ptr, err_label) \
495 ({ \
496 __u32 __res; \
497 if (get_user(val_byte(__res, 0), base + ptr) < 0) \
498 goto err_label; \
499 ptr++; \
500 if (get_user(val_byte(__res, 1), base + ptr) < 0) \
501 goto err_label; \
502 ptr++; \
503 if (get_user(val_byte(__res, 2), base + ptr) < 0) \
504 goto err_label; \
505 ptr++; \
506 if (get_user(val_byte(__res, 3), base + ptr) < 0) \
507 goto err_label; \
508 ptr++; \
509 __res; \
510 })
511
512/* There are so many possible reasons for this function to return
513 * VM86_INTx, so adding another doesn't bother me. We can expect
514 * userspace programs to be able to handle it. (Getting a problem
515 * in userspace is always better than an Oops anyway.) [KD]
516 */
517static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned char __user * ssp, unsigned short sp)
519{
520 unsigned long __user *intr_ptr;
521 unsigned long segoffs;
522
523 if (regs->pt.xcs == BIOSSEG)
524 goto cannot_handle;
525 if (is_revectored(i, &KVM86->int_revectored))
526 goto cannot_handle;
527 if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
528 goto cannot_handle;
529 intr_ptr = (unsigned long __user *) (i << 2);
530 if (get_user(segoffs, intr_ptr))
531 goto cannot_handle;
532 if ((segoffs >> 16) == BIOSSEG)
533 goto cannot_handle;
534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
536 pushw(ssp, sp, IP(regs), cannot_handle);
537 regs->pt.xcs = segoffs >> 16;
538 SP(regs) -= 6;
539 IP(regs) = segoffs & 0xffff;
540 clear_TF(regs);
541 clear_IF(regs);
542 clear_AC(regs);
543 return;
544
545cannot_handle:
546 return_to_32bit(regs, VM86_INTx + (i << 8));
547}
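
do_int() emulates a real-mode software interrupt: the handler's CS:IP pair sits in the interrupt vector table at linear address vector*4, FLAGS/CS/IP are pushed onto the guest stack, and execution resumes at the new CS:IP. A standalone sketch of the address arithmetic only; the segment and offset values are invented:

#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x21;			/* DOS services, as an example */
	unsigned long ivt_entry = vector << 2;		/* IVT slot: 4 bytes per vector */
	unsigned long segoffs = 0x08001234;		/* pretend the slot held 0800:1234 */
	unsigned int cs = segoffs >> 16;
	unsigned int ip = segoffs & 0xffff;

	/* Real-mode linear address of the handler: segment*16 + offset. */
	printf("IVT slot at 0x%03lx -> handler %04x:%04x (linear 0x%05x)\n",
	       ivt_entry, cs, ip, (cs << 4) + ip);
	return 0;
}
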
548
549int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
550{
551 if (VMPI.is_vm86pus) {
552 if ( (trapno==3) || (trapno==1) )
553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
555 return 0;
556 }
557 if (trapno !=1)
558		return 1; /* we let this be handled by the calling routine */
559 if (current->ptrace & PT_PTRACED) {
560 unsigned long flags;
561 spin_lock_irqsave(&current->sighand->siglock, flags);
562 sigdelset(&current->blocked, SIGTRAP);
563 recalc_sigpending();
564 spin_unlock_irqrestore(&current->sighand->siglock, flags);
565 }
566 send_sig(SIGTRAP, current, 1);
567 current->thread.trap_no = trapno;
568 current->thread.error_code = error_code;
569 return 0;
570}
571
572void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
573{
574 unsigned char opcode;
575 unsigned char __user *csp;
576 unsigned char __user *ssp;
577 unsigned short ip, sp, orig_flags;
578 int data32, pref_done;
579
580#define CHECK_IF_IN_TRAP \
581 if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
582 newflags |= TF_MASK
583#define VM86_FAULT_RETURN do { \
584 if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
585 return_to_32bit(regs, VM86_PICRETURN); \
586 if (orig_flags & TF_MASK) \
587 handle_vm86_trap(regs, 0, 1); \
588 return; } while (0)
589
590 orig_flags = *(unsigned short *)&regs->pt.eflags;
591
592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
594 sp = SP(regs);
595 ip = IP(regs);
596
597 data32 = 0;
598 pref_done = 0;
599 do {
600 switch (opcode = popb(csp, ip, simulate_sigsegv)) {
601 case 0x66: /* 32-bit data */ data32=1; break;
602 case 0x67: /* 32-bit address */ break;
603 case 0x2e: /* CS */ break;
604 case 0x3e: /* DS */ break;
605 case 0x26: /* ES */ break;
606 case 0x36: /* SS */ break;
607 case 0x65: /* GS */ break;
608 case 0x64: /* FS */ break;
609 case 0xf2: /* repnz */ break;
610 case 0xf3: /* rep */ break;
611 default: pref_done = 1;
612 }
613 } while (!pref_done);
614
615 switch (opcode) {
616
617 /* pushf */
618 case 0x9c:
619 if (data32) {
620 pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
621 SP(regs) -= 4;
622 } else {
623 pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
624 SP(regs) -= 2;
625 }
626 IP(regs) = ip;
627 VM86_FAULT_RETURN;
628
629 /* popf */
630 case 0x9d:
631 {
632 unsigned long newflags;
633 if (data32) {
634 newflags=popl(ssp, sp, simulate_sigsegv);
635 SP(regs) += 4;
636 } else {
637 newflags = popw(ssp, sp, simulate_sigsegv);
638 SP(regs) += 2;
639 }
640 IP(regs) = ip;
641 CHECK_IF_IN_TRAP;
642 if (data32) {
643 set_vflags_long(newflags, regs);
644 } else {
645 set_vflags_short(newflags, regs);
646 }
647 VM86_FAULT_RETURN;
648 }
649
650 /* int xx */
651 case 0xcd: {
652 int intno=popb(csp, ip, simulate_sigsegv);
653 IP(regs) = ip;
654 if (VMPI.vm86dbg_active) {
655 if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
656 return_to_32bit(regs, VM86_INTx + (intno << 8));
657 }
658 do_int(regs, intno, ssp, sp);
659 return;
660 }
661
662 /* iret */
663 case 0xcf:
664 {
665 unsigned long newip;
666 unsigned long newcs;
667 unsigned long newflags;
668 if (data32) {
669 newip=popl(ssp, sp, simulate_sigsegv);
670 newcs=popl(ssp, sp, simulate_sigsegv);
671 newflags=popl(ssp, sp, simulate_sigsegv);
672 SP(regs) += 12;
673 } else {
674 newip = popw(ssp, sp, simulate_sigsegv);
675 newcs = popw(ssp, sp, simulate_sigsegv);
676 newflags = popw(ssp, sp, simulate_sigsegv);
677 SP(regs) += 6;
678 }
679 IP(regs) = newip;
680 regs->pt.xcs = newcs;
681 CHECK_IF_IN_TRAP;
682 if (data32) {
683 set_vflags_long(newflags, regs);
684 } else {
685 set_vflags_short(newflags, regs);
686 }
687 VM86_FAULT_RETURN;
688 }
689
690 /* cli */
691 case 0xfa:
692 IP(regs) = ip;
693 clear_IF(regs);
694 VM86_FAULT_RETURN;
695
696 /* sti */
697 /*
698 * Damn. This is incorrect: the 'sti' instruction should actually
699 * enable interrupts after the /next/ instruction. Not good.
700 *
701 * Probably needs some horsing around with the TF flag. Aiee..
702 */
703 case 0xfb:
704 IP(regs) = ip;
705 set_IF(regs);
706 VM86_FAULT_RETURN;
707
708 default:
709 return_to_32bit(regs, VM86_UNKNOWN);
710 }
711
712 return;
713
714simulate_sigsegv:
715 /* FIXME: After a long discussion with Stas we finally
716	 * agreed that this is wrong. Here we should
717 * really send a SIGSEGV to the user program.
718 * But how do we create the correct context? We
719 * are inside a general protection fault handler
720	 * and have just returned from a page fault handler.
721 * The correct context for the signal handler
722 * should be a mixture of the two, but how do we
723 * get the information? [KD]
724 */
725 return_to_32bit(regs, VM86_UNKNOWN);
726}
727
728/* ---------------- vm86 special IRQ passing stuff ----------------- */
729
730#define VM86_IRQNAME "vm86irq"
731
732static struct vm86_irqs {
733 struct task_struct *tsk;
734 int sig;
735} vm86_irqs[16];
736
737static DEFINE_SPINLOCK(irqbits_lock);
738static int irqbits;
739
740#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
741 | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
742 | (1 << SIGUNUSED) )
743
744static irqreturn_t irq_handler(int intno, void *dev_id)
745{
746 int irq_bit;
747 unsigned long flags;
748
749 spin_lock_irqsave(&irqbits_lock, flags);
750 irq_bit = 1 << intno;
751 if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
752 goto out;
753 irqbits |= irq_bit;
754 if (vm86_irqs[intno].sig)
755 send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
756 /*
757 * IRQ will be re-enabled when user asks for the irq (whether
758 * polling or as a result of the signal)
759 */
760 disable_irq_nosync(intno);
761 spin_unlock_irqrestore(&irqbits_lock, flags);
762 return IRQ_HANDLED;
763
764out:
765 spin_unlock_irqrestore(&irqbits_lock, flags);
766 return IRQ_NONE;
767}
768
769static inline void free_vm86_irq(int irqnumber)
770{
771 unsigned long flags;
772
773 free_irq(irqnumber, NULL);
774 vm86_irqs[irqnumber].tsk = NULL;
775
776 spin_lock_irqsave(&irqbits_lock, flags);
777 irqbits &= ~(1 << irqnumber);
778 spin_unlock_irqrestore(&irqbits_lock, flags);
779}
780
781void release_vm86_irqs(struct task_struct *task)
782{
783 int i;
784 for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
785 if (vm86_irqs[i].tsk == task)
786 free_vm86_irq(i);
787}
788
789static inline int get_and_reset_irq(int irqnumber)
790{
791 int bit;
792 unsigned long flags;
793 int ret = 0;
794
795 if (invalid_vm86_irq(irqnumber)) return 0;
796 if (vm86_irqs[irqnumber].tsk != current) return 0;
797 spin_lock_irqsave(&irqbits_lock, flags);
798 bit = irqbits & (1 << irqnumber);
799 irqbits &= ~bit;
800 if (bit) {
801 enable_irq(irqnumber);
802 ret = 1;
803 }
804
805 spin_unlock_irqrestore(&irqbits_lock, flags);
806 return ret;
807}
808
809
810static int do_vm86_irq_handling(int subfunction, int irqnumber)
811{
812 int ret;
813 switch (subfunction) {
814 case VM86_GET_AND_RESET_IRQ: {
815 return get_and_reset_irq(irqnumber);
816 }
817 case VM86_GET_IRQ_BITS: {
818 return irqbits;
819 }
820 case VM86_REQUEST_IRQ: {
821 int sig = irqnumber >> 8;
822 int irq = irqnumber & 255;
823 if (!capable(CAP_SYS_ADMIN)) return -EPERM;
824 if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
825 if (invalid_vm86_irq(irq)) return -EPERM;
826 if (vm86_irqs[irq].tsk) return -EPERM;
827 ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
828 if (ret) return ret;
829 vm86_irqs[irq].sig = sig;
830 vm86_irqs[irq].tsk = current;
831 return irq;
832 }
833 case VM86_FREE_IRQ: {
834 if (invalid_vm86_irq(irqnumber)) return -EPERM;
835 if (!vm86_irqs[irqnumber].tsk) return 0;
836 if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
837 free_vm86_irq(irqnumber);
838 return 0;
839 }
840 }
841 return -EINVAL;
842}
843
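The VM86_REQUEST_IRQ case above decodes its single argument as sig = irqnumber >> 8 and irq = irqnumber & 255, so a caller has to pack both values into one integer. A minimal standalone sketch of that packing (a hypothetical user-side helper, not part of this patch; the signal must be in ALLOWED_SIGS and the IRQ must be a valid vm86 IRQ):

#include <stdio.h>
#include <signal.h>

/* Pack a signal number and an IRQ line into the encoding that
 * do_vm86_irq_handling(VM86_REQUEST_IRQ, ...) expects:
 * signal in the high byte, IRQ in the low byte. */
static unsigned int pack_vm86_irq_request(int sig, int irq)
{
	return ((unsigned int)sig << 8) | ((unsigned int)irq & 0xff);
}

int main(void)
{
	/* e.g. request SIGUSR1 delivery for IRQ 5 */
	printf("packed value: 0x%x\n", pack_vm86_irq_request(SIGUSR1, 5));
	return 0;
}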
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
new file mode 100644
index 000000000000..18673e0f193b
--- /dev/null
+++ b/arch/x86/kernel/vmi_32.c
@@ -0,0 +1,981 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <asm/vmi.h>
32#include <asm/io.h>
33#include <asm/fixmap.h>
34#include <asm/apicdef.h>
35#include <asm/apic.h>
36#include <asm/processor.h>
37#include <asm/timer.h>
38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h>
40
41/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
43typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
44
45#define call_vrom_func(rom,func) \
46 (((VROMFUNC *)(rom->func))())
47
48#define call_vrom_long_func(rom,func,arg) \
49 (((VROMLONGFUNC *)(rom->func)) (arg))
50
51static struct vrom_header *vmi_rom;
52static int disable_pge;
53static int disable_pse;
54static int disable_sep;
55static int disable_tsc;
56static int disable_mtrr;
57static int disable_noidle;
58static int disable_vmi_timer;
59
60/* Cached VMI operations */
61static struct {
62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned);
69 void (*update_pte)(pte_t *, unsigned);
70 void (*set_linear_mapping)(int, void *, u32, u32);
71 void (*_flush_tlb)(int);
72 void (*set_initial_ap_state)(int, int);
73 void (*halt)(void);
74 void (*set_lazy_mode)(int mode);
75} vmi_ops;
76
77/* Cached VMI operations */
78struct vmi_timer_ops vmi_timer_ops;
79
80/*
81 * VMI patching routines.
82 */
83#define MNEM_CALL 0xe8
84#define MNEM_JMP 0xe9
85#define MNEM_RET 0xc3
86
87#define IRQ_PATCH_INT_MASK 0
88#define IRQ_PATCH_DISABLE 5
89
90static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest)
92{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5;
94}
95
96static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip)
98{
99 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
101 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
102 switch(rel->type) {
103 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
107 return 5;
108
109 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_NOP:
116 /* obliterate the whole thing */
117 return 0;
118
119 case VMI_RELOCATION_NONE:
120 /* leave native code in place */
121 break;
122
123 default:
124 BUG();
125 }
126 return len;
127}
128
129/*
130 * Apply patch if appropriate, return length of new instruction
131 * sequence. The callee does nop padding for us.
132 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip);
146 case PARAVIRT_PATCH(save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip);
149 case PARAVIRT_PATCH(iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default:
154 break;
155 }
156 return len;
157}
158
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
161 unsigned int *ecx, unsigned int *edx)
162{
163 int override = 0;
164 if (*eax == 1)
165 override = 1;
166 asm volatile ("call *%6"
167 : "=a" (*eax),
168 "=b" (*ebx),
169 "=c" (*ecx),
170 "=d" (*edx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
172 if (override) {
173 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE;
175 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE;
177 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP;
179 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR;
183 }
184}
185
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b);
190}
191
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
193{
194 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
195 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
196 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
197 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
198}
199
200static void vmi_set_ldt(const void *addr, unsigned entries)
201{
202 unsigned cpu = smp_processor_id();
203 u32 low, high;
204
205 pack_descriptor(&low, &high, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210}
211
212static void vmi_set_tr(void)
213{
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215}
216
217static void vmi_load_esp0(struct tss_struct *tss,
218 struct thread_struct *thread)
219{
220 tss->x86_tss.esp0 = thread->esp0;
221
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
228}
229
230static void vmi_flush_tlb_user(void)
231{
232 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
233}
234
235static void vmi_flush_tlb_kernel(void)
236{
237 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
238}
239
240/* Stub to do nothing at all; used for delays and unimplemented calls */
241static void vmi_nop(void)
242{
243}
244
245#ifdef CONFIG_DEBUG_PAGE_TYPE
246
247#ifdef CONFIG_X86_PAE
248#define MAX_BOOT_PTS (2048+4+1)
249#else
250#define MAX_BOOT_PTS (1024+1)
251#endif
252
253/*
254 * During boot, mem_map is not yet available in paging_init, so stash
255 * all the boot page allocations here.
256 */
257static struct {
258 u32 pfn;
259 int type;
260} boot_page_allocations[MAX_BOOT_PTS];
261static int num_boot_page_allocations;
262static int boot_allocations_applied;
263
264void vmi_apply_boot_page_allocations(void)
265{
266 int i;
267 BUG_ON(!mem_map);
268 for (i = 0; i < num_boot_page_allocations; i++) {
269 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
270 page->type = boot_page_allocations[i].type;
271 page->type = boot_page_allocations[i].type &
272 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
273 }
274 boot_allocations_applied = 1;
275}
276
277static void record_page_type(u32 pfn, int type)
278{
279 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
280 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
281 boot_page_allocations[num_boot_page_allocations].type = type;
282 num_boot_page_allocations++;
283}
284
285static void check_zeroed_page(u32 pfn, int type, struct page *page)
286{
287 u32 *ptr;
288 int i;
289 int limit = PAGE_SIZE / sizeof(int);
290
291 if (page_address(page))
292 ptr = (u32 *)page_address(page);
293 else
294 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
295 /*
296 * When cloning the root in non-PAE mode, only the userspace
297 * pdes need to be zeroed.
298 */
299 if (type & VMI_PAGE_CLONE)
300 limit = USER_PTRS_PER_PGD;
301 for (i = 0; i < limit; i++)
302 BUG_ON(ptr[i]);
303}
304
305/*
306 * We stash the page type into struct page so we can verify the page
307 * types are used properly.
308 */
309static void vmi_set_page_type(u32 pfn, int type)
310{
311 /* PAE can have multiple roots per page - don't track */
312 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
313 return;
314
315 if (boot_allocations_applied) {
316 struct page *page = pfn_to_page(pfn);
317 if (type != VMI_PAGE_NORMAL)
318 BUG_ON(page->type);
319 else
320 BUG_ON(page->type == VMI_PAGE_NORMAL);
321 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
322 if (type & VMI_PAGE_ZEROED)
323 check_zeroed_page(pfn, type, page);
324 } else {
325 record_page_type(pfn, type);
326 }
327}
328
329static void vmi_check_page_type(u32 pfn, int type)
330{
331 /* PAE can have multiple roots per page - skip checks */
332 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
333 return;
334
335 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
336 if (boot_allocations_applied) {
337 struct page *page = pfn_to_page(pfn);
338 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
339 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
340 BUG_ON((type & page->type) == 0);
341 }
342}
343#else
344#define vmi_set_page_type(p,t) do { } while (0)
345#define vmi_check_page_type(p,t) do { } while (0)
346#endif
347
348#ifdef CONFIG_HIGHPTE
349static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
350{
351 void *va = kmap_atomic(page, type);
352
353 /*
354 * Internally, the VMI ROM must map virtual addresses to physical
355 * addresses for processing MMU updates. By the time MMU updates
356 * are issued, this information is typically already lost.
357 * Fortunately, the VMI provides a cache of mapping slots for active
358 * page tables.
359 *
360 * We use slot zero for the linear mapping of physical memory, and
361 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
362 *
363 * args: SLOT VA COUNT PFN
364 */
365 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
366 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
367
368 return va;
369}
370#endif
371
372static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
373{
374 vmi_set_page_type(pfn, VMI_PAGE_L1);
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376}
377
378static void vmi_allocate_pd(u32 pfn)
379{
380 /*
381 * This call comes in very early, before mem_map is setup.
382 * It is called only for swapper_pg_dir, which already has
383 * data on it.
384 */
385 vmi_set_page_type(pfn, VMI_PAGE_L2);
386 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
387}
388
389static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
390{
391 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
392 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
393 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
394}
395
396static void vmi_release_pt(u32 pfn)
397{
398 vmi_ops.release_page(pfn, VMI_PAGE_L1);
399 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
400}
401
402static void vmi_release_pd(u32 pfn)
403{
404 vmi_ops.release_page(pfn, VMI_PAGE_L2);
405 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
406}
407
408/*
409 * Helper macros for MMU update flags. We can defer updates until a flush
410 * or page invalidation only if the update is to the current address space
411 * (otherwise, there is no flush). We must check against init_mm, since
412 * this could be a kernel update, which usually passes init_mm, although
413 * sometimes this check can be skipped if we know the particular function
414 * is only called on user mode PTEs. We could change the kernel to pass
415 * current->active_mm here, but in particular, I was unsure if changing
416 * mm/highmem.c to do this would still be correct on other architectures.
417 */
418#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
419 (!mustbeuser && (mm) == &init_mm))
420#define vmi_flags_addr(mm, addr, level, user) \
421 ((level) | (is_current_as(mm, user) ? \
422 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
423#define vmi_flags_addr_defer(mm, addr, level, user) \
424 ((level) | (is_current_as(mm, user) ? \
425 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
426
427static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
428{
429 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
430 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
431}
432
433static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
434{
435 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
436 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
437}
438
439static void vmi_set_pte(pte_t *ptep, pte_t pte)
440{
441 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
442 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
443 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
444}
445
446static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
447{
448 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
449 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
450}
451
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{
454#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else
458 const pte_t pte = { pmdval.pud.pgd.pgd };
459 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
460#endif
461 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
462}
463
464#ifdef CONFIG_X86_PAE
465
466static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
467{
468 /*
469 * XXX This is called from set_pmd_pte, but at both PT
470 * and PD layers so the VMI_PAGE_PT flag is wrong. But
471 * it is only called for large page mapping changes,
472 * the Xen backend, doesn't support large pages, and the
473 * ESX backend doesn't depend on the flag.
474 */
475 set_64bit((unsigned long long *)ptep,pte_val(pteval));
476 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
477}
478
479static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
480{
481 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
482 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
483}
484
485static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{
487 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491}
492
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{
495 const pte_t pte = { 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498}
499
500static void vmi_pmd_clear(pmd_t *pmd)
501{
502 const pte_t pte = { 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505}
506#endif
507
508#ifdef CONFIG_SMP
509static void __devinit
510vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
511 unsigned long start_esp)
512{
513 struct vmi_ap_state ap;
514
515 /* Default everything to zero. This is fine for most GPRs. */
516 memset(&ap, 0, sizeof(struct vmi_ap_state));
517
518 ap.gdtr_limit = GDT_SIZE - 1;
519 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
520
521 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
522 ap.idtr_base = (unsigned long) idt_table;
523
524 ap.ldtr = 0;
525
526 ap.cs = __KERNEL_CS;
527 ap.eip = (unsigned long) start_eip;
528 ap.ss = __KERNEL_DS;
529 ap.esp = (unsigned long) start_esp;
530
531 ap.ds = __USER_DS;
532 ap.es = __USER_DS;
533 ap.fs = __KERNEL_PERCPU;
534 ap.gs = 0;
535
536 ap.eflags = 0;
537
538#ifdef CONFIG_X86_PAE
539 /* efer should match BSP efer. */
540 if (cpu_has_nx) {
541 unsigned l, h;
542 rdmsr(MSR_EFER, l, h);
543 ap.efer = (unsigned long long) h << 32 | l;
544 }
545#endif
546
547 ap.cr3 = __pa(swapper_pg_dir);
548 /* Protected mode, paging, AM, WP, NE, MP. */
549 ap.cr0 = 0x80050023;
550 ap.cr4 = mmu_cr4_features;
551 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
552}
553#endif
554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
558
559 if (!vmi_ops.set_lazy_mode)
560 return;
561
562 /* Modes should never nest or overlap */
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
564 mode == PARAVIRT_LAZY_FLUSH));
565
566 if (mode == PARAVIRT_LAZY_FLUSH) {
567 vmi_ops.set_lazy_mode(0);
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
569 } else {
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573}
574
575static inline int __init check_vmi_rom(struct vrom_header *rom)
576{
577 struct pci_header *pci;
578 struct pnp_header *pnp;
579 const char *manufacturer = "UNKNOWN";
580 const char *product = "UNKNOWN";
581 const char *license = "unspecified";
582
583 if (rom->rom_signature != 0xaa55)
584 return 0;
585 if (rom->vrom_signature != VMI_SIGNATURE)
586 return 0;
587 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
588 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
589 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
590 rom->api_version_maj,
591 rom->api_version_min);
592 return 0;
593 }
594
595 /*
596 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
597 * the PCI header and device type to make sure this is really a
598 * VMI device.
599 */
600 if (!rom->pci_header_offs) {
601 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
602 return 0;
603 }
604
605 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
606 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
607 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
608 /* Allow it to run... anyways, but warn */
609 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
610 }
611
612 if (rom->pnp_header_offs) {
613 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
614 if (pnp->manufacturer_offset)
615 manufacturer = (const char *)rom+pnp->manufacturer_offset;
616 if (pnp->product_offset)
617 product = (const char *)rom+pnp->product_offset;
618 }
619
620 if (rom->license_offs)
621 license = (char *)rom+rom->license_offs;
622
623 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
624 manufacturer, product,
625 rom->api_version_maj, rom->api_version_min,
626 pci->rom_version_maj, pci->rom_version_min);
627
628 /* Don't allow BSD/MIT here for now because we don't want to end up
629 with any binary only shim layers */
630 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
631 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
632 license);
633 return 0;
634 }
635
636 return 1;
637}
638
639/*
640 * Probe for the VMI option ROM
641 */
642static inline int __init probe_vmi_rom(void)
643{
644 unsigned long base;
645
646 /* VMI ROM is in option ROM area, check signature */
647 for (base = 0xC0000; base < 0xE0000; base += 2048) {
648 struct vrom_header *romstart;
649 romstart = (struct vrom_header *)isa_bus_to_virt(base);
650 if (check_vmi_rom(romstart)) {
651 vmi_rom = romstart;
652 return 1;
653 }
654 }
655 return 0;
656}
657
658/*
659 * VMI setup common to all processors
660 */
661void vmi_bringup(void)
662{
663 /* We must establish the lowmem mapping for MMU ops to work */
664 if (vmi_ops.set_linear_mapping)
665 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
666}
667
668/*
669 * Return a pointer to a VMI function or NULL if unimplemented
670 */
671static void *vmi_get_function(int vmicall)
672{
673 u64 reloc;
674 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
675 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
676 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
677 if (rel->type == VMI_RELOCATION_CALL_REL)
678 return (void *)rel->eip;
679 else
680 return NULL;
681}
682
683/*
684 * Helper macro for making the VMI paravirt-ops fill code readable.
685 * For unimplemented operations, fall back to default, unless nop
686 * is returned by the ROM.
687 */
688#define para_fill(opname, vmicall) \
689do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\
699 rel->type); \
700} while (0)
701
702/*
703 * Helper macro for making the VMI paravirt-ops fill code readable.
704 * For cached operations which do not match the VMI ROM ABI and must
705 * go through a tranlation stub. Ignore NOPs, since it is not clear
706 * a NOP * VMI function corresponds to a NOP paravirt-op when the
707 * functions are not in 1-1 correspondence.
708 */
709#define para_wrap(opname, wrapper, cache, vmicall) \
710do { \
711 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
712 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \
717 } \
718} while (0)
719
720/*
721 * Activate the VMI interface and switch into paravirtualized mode
722 */
723static inline int __init activate_vmi(void)
724{
725 short kernel_cs;
726 u64 reloc;
727 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
728
729 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
730 printk(KERN_ERR "VMI ROM failed to initialize!");
731 return 0;
732 }
733 savesegment(cs, kernel_cs);
734
735 paravirt_ops.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
737
738 paravirt_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740
741 /*
742 * Many of these operations are ABI compatible with VMI.
743 * This means we can fill in the paravirt-ops with direct
744 * pointers into the VMI ROM. If the calling convention for
745 * these operations changes, this code needs to be updated.
746 *
747 * Exceptions
748 * CPUID paravirt-op uses pointers, not the native ISA
749 * halt has no VMI equivalent; all VMI halts are "safe"
750 * no MSR support yet - just trap and emulate. VMI uses the
751 * same ABI as the native ISA, but Linux wants exceptions
752 * from bogus MSR read / write handled
753 * rdpmc is not yet used in Linux
754 */
755
756 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
758
759 para_fill(clts, CLTS);
760 para_fill(get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts);
774
775 para_fill(wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC);
777
778 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */
780 /* paravirt_ops.write_msr = vmi_wrmsr */
781 /* paravirt_ops.rdpmc = vmi_rdpmc */
782
783 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
785
786 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788
789 para_fill(load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT);
793 para_fill(store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
802
803 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage);
807
808 /*
809 * Until a standard flag format can be agreed on, we need to
810 * implement these as wrappers in Linux. Get the VMI ROM
811 * function pointers for the two backend calls.
812 */
813#ifdef CONFIG_X86_PAE
814 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
815 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
816#else
817 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
818 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
819#endif
820
821 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear;
831#endif
832 }
833
834 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
837 }
838
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 }
845
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd;
850 }
851
852 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif
858
859 /*
860 * These MUST always be patched. Don't support indirect jumps
861 * through these operations, as the VMI interface may use either
862 * a jump or a call to get to these operations, depending on
863 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem.
865 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0;
868
869#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif
872
873#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead);
875 para_fill(apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite);
877#endif
878
879 /*
880 * Check for VMI timer functionality by probing for a cycle frequency method
881 */
882 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
883 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
884 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
885 vmi_timer_ops.get_cycle_counter =
886 vmi_get_function(VMI_CALL_GetCycleCounter);
887 vmi_timer_ops.get_wallclock =
888 vmi_get_function(VMI_CALL_GetWallclockTime);
889 vmi_timer_ops.wallclock_updated =
890 vmi_get_function(VMI_CALL_WallclockUpdated);
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif
901 paravirt_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz;
903
904 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1;
906 } else {
907 disable_noidle = 1;
908 disable_vmi_timer = 1;
909 }
910
911 para_fill(safe_halt, Halt);
912
913 /*
914 * Alternative instruction rewriting doesn't happen soon enough
915 * to convert VMI_IRET to a call instead of a jump; so we have
916 * to do this before IRQs get reenabled. Fortunately, it is
917 * idempotent.
918 */
919 apply_paravirt(__parainstructions, __parainstructions_end);
920
921 vmi_bringup();
922
923 return 1;
924}
925
926#undef para_fill
927
928void __init vmi_init(void)
929{
930 unsigned long flags;
931
932 if (!vmi_rom)
933 probe_vmi_rom();
934 else
935 check_vmi_rom(vmi_rom);
936
937	/* In case probing for or validating the ROM failed, bail */
938 if (!vmi_rom)
939 return;
940
941 reserve_top_address(-vmi_rom->virtual_top);
942
943 local_irq_save(flags);
944 activate_vmi();
945
946#ifdef CONFIG_X86_IO_APIC
947 /* This is virtual hardware; timer routing is wired correctly */
948 no_timer_check = 1;
949#endif
950 local_irq_restore(flags & X86_EFLAGS_IF);
951}
952
953static int __init parse_vmi(char *arg)
954{
955 if (!arg)
956 return -EINVAL;
957
958 if (!strcmp(arg, "disable_pge")) {
959 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
960 disable_pge = 1;
961 } else if (!strcmp(arg, "disable_pse")) {
962 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
963 disable_pse = 1;
964 } else if (!strcmp(arg, "disable_sep")) {
965 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
966 disable_sep = 1;
967 } else if (!strcmp(arg, "disable_tsc")) {
968 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
969 disable_tsc = 1;
970 } else if (!strcmp(arg, "disable_mtrr")) {
971 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
972 disable_mtrr = 1;
973 } else if (!strcmp(arg, "disable_timer")) {
974 disable_vmi_timer = 1;
975 disable_noidle = 1;
976 } else if (!strcmp(arg, "disable_noidle"))
977 disable_noidle = 1;
978 return 0;
979}
980
981early_param("vmi", parse_vmi);
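The patching helpers near the top of this file (MNEM_CALL, patch_offset() and patch_internal()) rewrite a paravirt call site into a 5-byte CALL or JMP whose rel32 operand is dest - eip - 5, i.e. the displacement measured from the end of the instruction. A minimal standalone sketch of that encoding, using made-up addresses:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MNEM_CALL 0xe8

/* Emit "call dest" at address eip: one opcode byte followed by a signed
 * 32-bit displacement relative to the end of the 5-byte instruction
 * (the same arithmetic patch_offset() performs). */
static void encode_call(uint8_t insn[5], uint32_t eip, uint32_t dest)
{
	int32_t rel = (int32_t)(dest - eip - 5);
	insn[0] = MNEM_CALL;
	memcpy(insn + 1, &rel, sizeof(rel));	/* host byte order; little-endian on x86 */
}

int main(void)
{
	uint8_t buf[5];
	encode_call(buf, 0xc0100000u, 0xc0200000u);
	printf("%02x %02x %02x %02x %02x\n",
	       buf[0], buf[1], buf[2], buf[3], buf[4]);
	return 0;
}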
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
new file mode 100644
index 000000000000..b1b5ab08b26e
--- /dev/null
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -0,0 +1,320 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/arch_hooks.h>
32#include <asm/apicdef.h>
33#include <asm/apic.h>
34#include <asm/timer.h>
35#include <asm/i8253.h>
36
37#include <irq_vectors.h>
38#include "io_ports.h"
39
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
42
43static DEFINE_PER_CPU(struct clock_event_device, local_events);
44
45static inline u32 vmi_counter(u32 flags)
46{
47 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
48 * cycle counter. */
49 return flags & VMI_ALARM_COUNTER_MASK;
50}
51
52/* paravirt_ops.get_wallclock = vmi_get_wallclock */
53unsigned long vmi_get_wallclock(void)
54{
55 unsigned long long wallclock;
56 wallclock = vmi_timer_ops.get_wallclock(); // nsec
57 (void)do_div(wallclock, 1000000000); // sec
58
59 return wallclock;
60}
61
62/* paravirt_ops.set_wallclock = vmi_set_wallclock */
63int vmi_set_wallclock(unsigned long now)
64{
65 return 0;
66}
67
68/* paravirt_ops.sched_clock = vmi_sched_clock */
69unsigned long long vmi_sched_clock(void)
70{
71 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
72}
73
74/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
75unsigned long vmi_cpu_khz(void)
76{
77 unsigned long long khz;
78 khz = vmi_timer_ops.get_cycle_frequency();
79 (void)do_div(khz, 1000);
80 return khz;
81}
82
83static inline unsigned int vmi_get_timer_vector(void)
84{
85#ifdef CONFIG_X86_IO_APIC
86 return FIRST_DEVICE_VECTOR;
87#else
88 return FIRST_EXTERNAL_VECTOR;
89#endif
90}
91
92/** vmi clockchip */
93#ifdef CONFIG_X86_LOCAL_APIC
94static unsigned int startup_timer_irq(unsigned int irq)
95{
96 unsigned long val = apic_read(APIC_LVTT);
97 apic_write(APIC_LVTT, vmi_get_timer_vector());
98
99 return (val & APIC_SEND_PENDING);
100}
101
102static void mask_timer_irq(unsigned int irq)
103{
104 unsigned long val = apic_read(APIC_LVTT);
105 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
106}
107
108static void unmask_timer_irq(unsigned int irq)
109{
110 unsigned long val = apic_read(APIC_LVTT);
111 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
112}
113
114static void ack_timer_irq(unsigned int irq)
115{
116 ack_APIC_irq();
117}
118
119static struct irq_chip vmi_chip __read_mostly = {
120 .name = "VMI-LOCAL",
121 .startup = startup_timer_irq,
122 .mask = mask_timer_irq,
123 .unmask = unmask_timer_irq,
124 .ack = ack_timer_irq
125};
126#endif
127
128/** vmi clockevent */
129#define VMI_ALARM_WIRED_IRQ0 0x00000000
130#define VMI_ALARM_WIRED_LVTT 0x00010000
131static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
132
133static inline int vmi_get_alarm_wiring(void)
134{
135 return vmi_wiring;
136}
137
138static void vmi_timer_set_mode(enum clock_event_mode mode,
139 struct clock_event_device *evt)
140{
141 cycle_t now, cycles_per_hz;
142 BUG_ON(!irqs_disabled());
143
144 switch (mode) {
145 case CLOCK_EVT_MODE_ONESHOT:
146 case CLOCK_EVT_MODE_RESUME:
147 break;
148 case CLOCK_EVT_MODE_PERIODIC:
149 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
150 (void)do_div(cycles_per_hz, HZ);
151 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
152 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
153 break;
154 case CLOCK_EVT_MODE_UNUSED:
155 case CLOCK_EVT_MODE_SHUTDOWN:
156 switch (evt->mode) {
157 case CLOCK_EVT_MODE_ONESHOT:
158 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
159 break;
160 case CLOCK_EVT_MODE_PERIODIC:
161 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
162 break;
163 default:
164 break;
165 }
166 break;
167 default:
168 break;
169 }
170}
171
172static int vmi_timer_next_event(unsigned long delta,
173 struct clock_event_device *evt)
174{
175 /* Unfortunately, set_next_event interface only passes relative
176	 * expiry, but we want absolute expiry.  It'd be better if we
177	 * were passed an absolute expiry, since a bunch of time may
178 * have been stolen between the time the delta is computed and
179 * when we set the alarm below. */
180 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
181
182 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
183 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
184 return 0;
185}
186
187static struct clock_event_device vmi_clockevent = {
188 .name = "vmi-timer",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .shift = 22,
191 .set_mode = vmi_timer_set_mode,
192 .set_next_event = vmi_timer_next_event,
193 .rating = 1000,
194 .irq = 0,
195};
196
197static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
198{
199 struct clock_event_device *evt = &__get_cpu_var(local_events);
200 evt->event_handler(evt);
201 return IRQ_HANDLED;
202}
203
204static struct irqaction vmi_clock_action = {
205 .name = "vmi-timer",
206 .handler = vmi_timer_interrupt,
207 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
208 .mask = CPU_MASK_ALL,
209};
210
211static void __devinit vmi_time_init_clockevent(void)
212{
213 cycle_t cycles_per_msec;
214 struct clock_event_device *evt;
215
216 int cpu = smp_processor_id();
217 evt = &__get_cpu_var(local_events);
218
219 /* Use cycles_per_msec since div_sc params are 32-bits. */
220 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
221 (void)do_div(cycles_per_msec, 1000);
222
223 memcpy(evt, &vmi_clockevent, sizeof(*evt));
224	/* Must pick .shift such that .mult fits in 32 bits.  Choosing
225	 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
226 * before overflow. */
227 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
228 /* Upper bound is clockevent's use of ulong for cycle deltas. */
229 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
230 evt->min_delta_ns = clockevent_delta2ns(1, evt);
231 evt->cpumask = cpumask_of_cpu(cpu);
232
233 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
234 evt->name, evt->mult, evt->shift);
235 clockevents_register_device(evt);
236}
237
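The mult/shift comment in vmi_time_init_clockevent() above amounts to mult being roughly (cycles_per_msec << shift) / NSEC_PER_MSEC, which is what the div_sc() call computes; with shift = 22 the counter may run at up to 2^10 cycles per nanosecond before mult overflows 32 bits. A worked standalone sketch, assuming a hypothetical 1 GHz cycle counter:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	unsigned int shift = 22;
	uint64_t cycles_per_msec = 1000000;	/* assumed 1 GHz counter */

	/* what div_sc(cycles_per_msec, NSEC_PER_MSEC, shift) boils down to */
	uint64_t mult = (cycles_per_msec << shift) / NSEC_PER_MSEC;

	/* converting a 1 ms (1,000,000 ns) event delta back into cycles */
	uint64_t delta_ns = 1000000;
	uint64_t delta_cycles = (delta_ns * mult) >> shift;

	printf("mult=%llu, 1 ms -> %llu cycles\n",
	       (unsigned long long)mult, (unsigned long long)delta_cycles);
	return 0;
}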
238void __init vmi_time_init(void)
239{
240	/* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242
243 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action);
245}
246
247#ifdef CONFIG_X86_LOCAL_APIC
248void __devinit vmi_time_bsp_init(void)
249{
250 /*
251 * On APIC systems, we want local timers to fire on each cpu. We do
252 * this by programming LVTT to deliver timer events to the IRQ handler
253 * for IRQ-0, since we can't re-use the APIC local timer handler
254 * without interfering with that code.
255 */
256 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
257 local_irq_disable();
258#ifdef CONFIG_X86_SMP
259 /*
260 * XXX handle_percpu_irq only defined for SMP; we need to switch over
261 * to using it, since this is a local interrupt, which each CPU must
262 * handle individually without locking out or dropping simultaneous
263 * local timers on other CPUs. We also don't want to trigger the
264 * quirk workaround code for interrupts which gets invoked from
265 * handle_percpu_irq via eoi, so we use our own IRQ chip.
266 */
267 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
268#else
269 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
270#endif
271 vmi_wiring = VMI_ALARM_WIRED_LVTT;
272 apic_write(APIC_LVTT, vmi_get_timer_vector());
273 local_irq_enable();
274 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
275}
276
277void __devinit vmi_time_ap_init(void)
278{
279 vmi_time_init_clockevent();
280 apic_write(APIC_LVTT, vmi_get_timer_vector());
281}
282#endif
283
284/** vmi clocksource */
285
286static cycle_t read_real_cycles(void)
287{
288 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
289}
290
291static struct clocksource clocksource_vmi = {
292 .name = "vmi-timer",
293 .rating = 450,
294 .read = read_real_cycles,
295 .mask = CLOCKSOURCE_MASK(64),
296 .mult = 0, /* to be set */
297 .shift = 22,
298 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
299};
300
301static int __init init_vmi_clocksource(void)
302{
303 cycle_t cycles_per_msec;
304
305 if (!vmi_timer_ops.get_cycle_frequency)
306 return 0;
307 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
308 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
309 (void)do_div(cycles_per_msec, 1000);
310
311 /* Note that clocksource.{mult, shift} converts in the opposite direction
312 * as clockevents. */
313 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
314 clocksource_vmi.shift);
315
316 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
317 return clocksource_register(&clocksource_vmi);
318
319}
320module_init(init_vmi_clocksource);
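As the comment in init_vmi_clocksource() above notes, clocksource mult/shift convert in the opposite direction from clockevents: a clocksource maps cycles to nanoseconds as ns = (cycles * mult) >> shift, with mult coming from clocksource_khz2mult() (roughly (10^6 << shift) / khz). A standalone sketch, assuming a hypothetical 1 MHz counter:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int shift = 22;
	uint64_t khz = 1000;			/* assumed 1 MHz counter */

	/* roughly what clocksource_khz2mult(khz, shift) returns */
	uint32_t mult = (uint32_t)((1000000ULL << shift) / khz);

	uint64_t cycles = 5000;			/* 5 ms worth of cycles at 1 MHz */
	uint64_t ns = (cycles * mult) >> shift;	/* cycles -> nanoseconds */

	printf("mult=%u, %llu cycles -> %llu ns\n",
	       mult, (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}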
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..849ee611f013
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S"
3#else
4# include "vmlinux_64.lds.S"
5#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
new file mode 100644
index 000000000000..7d72cce00529
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -0,0 +1,213 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value should remain constant even if the kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value should remain constant even if the kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
17#define LOAD_OFFSET __PAGE_OFFSET
18
19#include <asm-generic/vmlinux.lds.h>
20#include <asm/thread_info.h>
21#include <asm/page.h>
22#include <asm/cache.h>
23#include <asm/boot.h>
24
25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
26OUTPUT_ARCH(i386)
27ENTRY(phys_startup_32)
28jiffies = jiffies_64;
29
30PHDRS {
31 text PT_LOAD FLAGS(5); /* R_E */
32 data PT_LOAD FLAGS(7); /* RWE */
33 note PT_NOTE FLAGS(0); /* ___ */
34}
35SECTIONS
36{
37 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
38 phys_startup_32 = startup_32 - LOAD_OFFSET;
39
40 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
41 _text = .; /* Text and read-only data */
42 *(.text.head)
43 } :text = 0x9090
44
45 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) {
47 TEXT_TEXT
48 SCHED_TEXT
49 LOCK_TEXT
50 KPROBES_TEXT
51 *(.fixup)
52 *(.gnu.warning)
53 _etext = .; /* End of text section */
54 } :text = 0x9090
55
56 . = ALIGN(16); /* Exception table */
57 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
58 __start___ex_table = .;
59 *(__ex_table)
60 __stop___ex_table = .;
61 }
62
63 NOTES :text :note
64
65 BUG_TABLE :text
66
67 . = ALIGN(4);
68 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
69 __tracedata_start = .;
70 *(.tracedata)
71 __tracedata_end = .;
72 }
73
74 RODATA
75
76 /* writeable */
77 . = ALIGN(4096);
78 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
79 DATA_DATA
80 CONSTRUCTORS
81 } :data
82
83 . = ALIGN(4096);
84 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
85 __nosave_begin = .;
86 *(.data.nosave)
87 . = ALIGN(4096);
88 __nosave_end = .;
89 }
90
91 . = ALIGN(4096);
92 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
93 *(.data.page_aligned)
94 *(.data.idt)
95 }
96
97 . = ALIGN(32);
98 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
99 *(.data.cacheline_aligned)
100 }
101
102 /* rarely changed data like cpu maps */
103 . = ALIGN(32);
104 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
105 *(.data.read_mostly)
106 _edata = .; /* End of data section */
107 }
108
109 . = ALIGN(THREAD_SIZE); /* init_task */
110 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
111 *(.data.init_task)
112 }
113
114 /* might get freed after init */
115 . = ALIGN(4096);
116 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
117 __smp_locks = .;
118 *(.smp_locks)
119 __smp_locks_end = .;
120 }
121 /* will be freed after init
122 * Following ALIGN() is required to make sure no other data falls on the
123 * same page where __smp_alt_end is pointing as that page might be freed
124 * after boot. Always make sure that ALIGN() directive is present after
125 * the section which contains __smp_alt_end.
126 */
127 . = ALIGN(4096);
128
129 /* will be freed after init */
130 . = ALIGN(4096); /* Init code and data */
131 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
132 __init_begin = .;
133 _sinittext = .;
134 *(.init.text)
135 _einittext = .;
136 }
137 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
138 . = ALIGN(16);
139 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
140 __setup_start = .;
141 *(.init.setup)
142 __setup_end = .;
143 }
144 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
145 __initcall_start = .;
146 INITCALLS
147 __initcall_end = .;
148 }
149 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
150 __con_initcall_start = .;
151 *(.con_initcall.init)
152 __con_initcall_end = .;
153 }
154 SECURITY_INIT
155 . = ALIGN(4);
156 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
157 __alt_instructions = .;
158 *(.altinstructions)
159 __alt_instructions_end = .;
160 }
161 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
162 *(.altinstr_replacement)
163 }
164 . = ALIGN(4);
165 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
166 __parainstructions = .;
167 *(.parainstructions)
168 __parainstructions_end = .;
169 }
170  /* .exit.text is discarded at runtime, not link time, to deal with references
171 from .altinstructions and .eh_frame */
172 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
174#if defined(CONFIG_BLK_DEV_INITRD)
175 . = ALIGN(4096);
176 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
177 __initramfs_start = .;
178 *(.init.ramfs)
179 __initramfs_end = .;
180 }
181#endif
182 . = ALIGN(4096);
183 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
184 __per_cpu_start = .;
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(4096);
190 /* freed after init ends here */
191
192 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
193 __init_end = .;
194 __bss_start = .; /* BSS */
195 *(.bss.page_aligned)
196 *(.bss)
197 . = ALIGN(4);
198 __bss_stop = .;
199 _end = . ;
200 /* This is where the kernel creates the early boot page tables */
201 . = ALIGN(4096);
202 pg0 = . ;
203 }
204
205 /* Sections to be discarded */
206 /DISCARD/ : {
207 *(.exitcall.exit)
208 }
209
210 STABS_DEBUG
211
212 DWARF_DEBUG
213}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
new file mode 100644
index 000000000000..ba8ea97abd21
--- /dev/null
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -0,0 +1,235 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/page.h>
9
10#undef i386 /* in case the preprocessor is a 32bit one */
11
12OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64)
15jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(4); /* R__ */
23}
24SECTIONS
25{
26 . = __START_KERNEL;
27 phys_startup_64 = startup_64 - LOAD_OFFSET;
28 _text = .; /* Text and read-only data */
29 .text : AT(ADDR(.text) - LOAD_OFFSET) {
30 /* First the code that has to be first for bootstrapping */
31 *(.text.head)
32 _stext = .;
33 /* Then the rest */
34 TEXT_TEXT
35 SCHED_TEXT
36 LOCK_TEXT
37 KPROBES_TEXT
38 *(.fixup)
39 *(.gnu.warning)
40 } :text = 0x9090
41 /* out-of-line lock text */
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45
46 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .;
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
49 __stop___ex_table = .;
50
51 NOTES :text :note
52
53 BUG_TABLE :text
54
55 RODATA
56
57 . = ALIGN(4);
58 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
59 __tracedata_start = .;
60 *(.tracedata)
61 __tracedata_end = .;
62 }
63
64 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
65 /* Data */
66 .data : AT(ADDR(.data) - LOAD_OFFSET) {
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 _edata = .; /* End of data section */
72
73 . = ALIGN(PAGE_SIZE);
74 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
75 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
76 *(.data.cacheline_aligned)
77 }
78 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
79 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
80 *(.data.read_mostly)
81 }
82
83#define VSYSCALL_ADDR (-10*1024*1024)
84#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
85#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
86
87#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
88#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
89
90#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
91#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
92
93 . = VSYSCALL_ADDR;
94 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
95 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
96
97 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
98 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
99 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
100 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
101 { *(.vsyscall_gtod_data) }
102 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
103 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
104 { *(.vsyscall_clock) }
105 vsyscall_clock = VVIRT(.vsyscall_clock);
106
107
108 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
109 { *(.vsyscall_1) }
110 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
111 { *(.vsyscall_2) }
112
113 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
114 vgetcpu_mode = VVIRT(.vgetcpu_mode);
115
116 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
117 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
118 jiffies = VVIRT(.jiffies);
119
120 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
121 { *(.vsyscall_3) }
122
123 . = VSYSCALL_VIRT_ADDR + 4096;
124
125#undef VSYSCALL_ADDR
126#undef VSYSCALL_PHYS_ADDR
127#undef VSYSCALL_VIRT_ADDR
128#undef VLOAD_OFFSET
129#undef VLOAD
130#undef VVIRT_OFFSET
131#undef VVIRT
132
133 . = ALIGN(8192); /* init_task */
134 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
135 *(.data.init_task)
136 }:data.init
137
138 . = ALIGN(4096);
139 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
140 *(.data.page_aligned)
141 }
142
143 /* might get freed after init */
144 . = ALIGN(4096);
145 __smp_alt_begin = .;
146 __smp_locks = .;
147 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
148 *(.smp_locks)
149 }
150 __smp_locks_end = .;
151 . = ALIGN(4096);
152 __smp_alt_end = .;
153
154 . = ALIGN(4096); /* Init code and data */
155 __init_begin = .;
156 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
157 _sinittext = .;
158 *(.init.text)
159 _einittext = .;
160 }
161 __initdata_begin = .;
162 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
163 __initdata_end = .;
164 . = ALIGN(16);
165 __setup_start = .;
166 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
167 __setup_end = .;
168 __initcall_start = .;
169 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
170 INITCALLS
171 }
172 __initcall_end = .;
173 __con_initcall_start = .;
174 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
175 *(.con_initcall.init)
176 }
177 __con_initcall_end = .;
178 SECURITY_INIT
179 . = ALIGN(8);
180 __alt_instructions = .;
181 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
182 *(.altinstructions)
183 }
184 __alt_instructions_end = .;
185 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
186 *(.altinstr_replacement)
187 }
188  /* .exit.text is discarded at runtime, not link time, to deal with references
189 from .altinstructions and .eh_frame */
190 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
191 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
192
193/* vdso blob that is mapped into user space */
194 vdso_start = . ;
195 .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
196 . = ALIGN(4096);
197 vdso_end = .;
198
199#ifdef CONFIG_BLK_DEV_INITRD
200 . = ALIGN(4096);
201 __initramfs_start = .;
202 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
203 __initramfs_end = .;
204#endif
205
206 PERCPU(4096)
207
208 . = ALIGN(4096);
209 __init_end = .;
210
211 . = ALIGN(4096);
212 __nosave_begin = .;
213 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
214 . = ALIGN(4096);
215 __nosave_end = .;
216
217 __bss_start = .; /* BSS */
218 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
219 *(.bss.page_aligned)
220 *(.bss)
221 }
222 __bss_stop = .;
223
224 _end = . ;
225
226 /* Sections to be discarded */
227 /DISCARD/ : {
228 *(.exitcall.exit)
229 *(.eh_frame)
230 }
231
232 STABS_DEBUG
233
234 DWARF_DEBUG
235}
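The VSYSCALL_PHYS_ADDR and VSYSCALL_VIRT_ADDR macros above place the vsyscall page at the first 4 KiB boundary after .data.read_mostly using the round-up idiom (x + 4095) & ~4095. A tiny standalone illustration of that alignment arithmetic, with a made-up address:

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next 4 KiB boundary, as the linker-script macros do. */
static uint64_t page_align_up(uint64_t x)
{
	return (x + 4095) & ~4095ULL;
}

int main(void)
{
	uint64_t addr = 0x1234ffULL;
	printf("0x%llx -> 0x%llx\n",
	       (unsigned long long)addr, (unsigned long long)page_align_up(addr));
	return 0;
}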
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644
index 000000000000..414caf0c5f9a
--- /dev/null
+++ b/arch/x86/kernel/vsmp_64.c
@@ -0,0 +1,49 @@
1/*
2 * vSMPowered(tm) systems specific initialization
3 * Copyright (C) 2005 ScaleMP Inc.
4 *
5 * Use of this code is subject to the terms and conditions of the
6 * GNU general public license version 2. See "COPYING" or
7 * http://www.gnu.org/licenses/gpl.html
8 *
9 * Ravikiran Thirumalai <kiran@scalemp.com>,
10 * Shai Fultheim <shai@scalemp.com>
11 */
12
13#include <linux/init.h>
14#include <linux/pci_ids.h>
15#include <linux/pci_regs.h>
16#include <asm/pci-direct.h>
17#include <asm/io.h>
18
19static int __init vsmp_init(void)
20{
21 void *address;
22 unsigned int cap, ctl;
23
24 if (!early_pci_allowed())
25 return 0;
26
27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0;
31
32 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address);
35 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
37 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4);
40 writel(ctl, address + 4);
41 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl);
43 }
44
45 iounmap(address);
46 return 0;
47}
48
49core_initcall(vsmp_init);
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
new file mode 100644
index 000000000000..103cab6aa7c0
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-int80_32.S
@@ -0,0 +1,53 @@
1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 int $0x80
16 ret
17.LEND_vsyscall:
18 .size __kernel_vsyscall,.-.LSTART_vsyscall
19 .previous
20
21 .section .eh_frame,"a",@progbits
22.LSTARTFRAMEDLSI:
23 .long .LENDCIEDLSI-.LSTARTCIEDLSI
24.LSTARTCIEDLSI:
25 .long 0 /* CIE ID */
26 .byte 1 /* Version number */
27 .string "zR" /* NUL-terminated augmentation string */
28 .uleb128 1 /* Code alignment factor */
29 .sleb128 -4 /* Data alignment factor */
30 .byte 8 /* Return address register column */
31 .uleb128 1 /* Augmentation value length */
32 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
33 .byte 0x0c /* DW_CFA_def_cfa */
34 .uleb128 4
35 .uleb128 4
36 .byte 0x88 /* DW_CFA_offset, column 0x8 */
37 .uleb128 1
38 .align 4
39.LENDCIEDLSI:
40 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
41.LSTARTFDEDLSI:
42 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
43 .long .LSTART_vsyscall-. /* PC-relative start address */
44 .long .LEND_vsyscall-.LSTART_vsyscall
45 .uleb128 0
46 .align 4
47.LENDFDEDLSI:
48 .previous
49
50/*
51 * Get the common code for the sigreturn entry points.
52 */
53#include "vsyscall-sigreturn_32.S"
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
new file mode 100644
index 000000000000..fcf376a37f79
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-note_32.S
@@ -0,0 +1,45 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/version.h>
7#include <linux/elfnote.h>
8
9/* Ideally this would use UTS_NAME, but using a quoted string here
10 doesn't work. Remember to change this when changing the
11 kernel's name. */
12ELFNOTE_START(Linux, 0, "a")
13 .long LINUX_VERSION_CODE
14ELFNOTE_END
15
16#ifdef CONFIG_XEN
17/*
18 * Add a special note telling glibc's dynamic linker a fake hardware
19 * flavor that it will use to choose the search path for libraries in the
20 * same way it uses real hardware capabilities like "mmx".
21 * We supply "nosegneg" as the fake capability, to indicate that we
22 * do not like negative offsets in instructions using segment overrides,
23 * since we implement those inefficiently. This makes it possible to
24 * install libraries optimized to avoid those access patterns in someplace
25 * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/ file
26 * corresponding to the bits here is needed to make ldconfig work right.
27 * It should contain:
28 * hwcap 1 nosegneg
29 * to match the mapping of bit to name that we give here.
30 *
31 * At runtime, the fake hardware feature will be considered to be present
32 * if its bit is set in the mask word. So, we start with the mask 0, and
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */
35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */
41VDSO_NOTE_MASK:
42 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END
45#endif
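The nosegneg note above only takes effect if glibc's ldconfig is told about the fake hardware capability; the kernel comment names the single required line but not a file. A minimal sketch of the matching drop-in, assuming a file name of /etc/ld.so.conf.d/nosegneg.conf (the name is an assumption; only the hwcap line comes from the comment above):

	# /etc/ld.so.conf.d/nosegneg.conf (hypothetical file name)
	hwcap 1 nosegneg

With that mapping in place, ldconfig prefers libraries installed under a nosegneg directory such as /lib/i686/tls/nosegneg whenever the mask word in the vDSO note has the bit set at runtime.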
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
new file mode 100644
index 000000000000..a92262f41659
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sigreturn_32.S
@@ -0,0 +1,143 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the
5 * vsyscall entry point. The kernel assumes that the addresses of these
6 * routines are constant for all vsyscall implementations.
7 */
8
9#include <asm/unistd.h>
10#include <asm/asm-offsets.h>
11
12
13/* XXX
14 Should these be named "_sigtramp" or something?
15*/
16
17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function
21__kernel_sigreturn:
22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax
25 int $0x80
26.LEND_sigreturn:
27 .size __kernel_sigreturn,.-.LSTART_sigreturn
28
29 .balign 32
30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function
32__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax
35 int $0x80
36.LEND_rt_sigreturn:
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous
40
41 .section .eh_frame,"a",@progbits
42.LSTARTFRAMEDLSI1:
43 .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
44.LSTARTCIEDLSI1:
45 .long 0 /* CIE ID */
46 .byte 1 /* Version number */
47 .string "zRS" /* NUL-terminated augmentation string */
48 .uleb128 1 /* Code alignment factor */
49 .sleb128 -4 /* Data alignment factor */
50 .byte 8 /* Return address register column */
51 .uleb128 1 /* Augmentation value length */
52 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
53 .byte 0 /* DW_CFA_nop */
54 .align 4
55.LENDCIEDLSI1:
56 .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
57.LSTARTFDEDLSI1:
58 .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
59 /* HACK: The dwarf2 unwind routines will subtract 1 from the
60 return address to get an address in the middle of the
61 presumed call instruction. Since we didn't get here via
62 a call, we need to include the nop before the real start
63 to make up for it. */
64 .long .LSTART_sigreturn-1-. /* PC-relative start address */
65 .long .LEND_sigreturn-.LSTART_sigreturn+1
66 .uleb128 0 /* Augmentation */
67 /* What follows are the instructions for the table generation.
68 We record the locations of each register saved. This is
69 complicated by the fact that the "CFA" is always assumed to
70 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */
77
78#define do_cfa_expr(offset) \
79 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
80 .uleb128 1f-0f; /* length */ \
810: .byte 0x74; /* DW_OP_breg4 */ \
82 .sleb128 offset; /* offset */ \
83 .byte 0x06; /* DW_OP_deref */ \
841:
85
86#define do_expr(regno, offset) \
87 .byte 0x10; /* DW_CFA_expression */ \
88 .uleb128 regno; /* regno */ \
89 .uleb128 1f-0f; /* length */ \
900: .byte 0x74; /* DW_OP_breg4 */ \
91 .sleb128 offset; /* offset */ \
921:
93
94 do_cfa_expr(SIGCONTEXT_esp+4)
95 do_expr(0, SIGCONTEXT_eax+4)
96 do_expr(1, SIGCONTEXT_ecx+4)
97 do_expr(2, SIGCONTEXT_edx+4)
98 do_expr(3, SIGCONTEXT_ebx+4)
99 do_expr(5, SIGCONTEXT_ebp+4)
100 do_expr(6, SIGCONTEXT_esi+4)
101 do_expr(7, SIGCONTEXT_edi+4)
102 do_expr(8, SIGCONTEXT_eip+4)
103
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105
106 do_cfa_expr(SIGCONTEXT_esp)
107 do_expr(0, SIGCONTEXT_eax)
108 do_expr(1, SIGCONTEXT_ecx)
109 do_expr(2, SIGCONTEXT_edx)
110 do_expr(3, SIGCONTEXT_ebx)
111 do_expr(5, SIGCONTEXT_ebp)
112 do_expr(6, SIGCONTEXT_esi)
113 do_expr(7, SIGCONTEXT_edi)
114 do_expr(8, SIGCONTEXT_eip)
115
116 .align 4
117.LENDFDEDLSI1:
118
119 .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
120.LSTARTFDEDLSI2:
121 .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
122 /* HACK: See above wrt unwind library assumptions. */
123 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
124 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
125 .uleb128 0 /* Augmentation */
126 /* What follows are the instructions for the table generation.
127 We record the locations of each register saved. This is
128 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */
130
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
140
141 .align 4
142.LENDFDEDLSI2:
143 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
new file mode 100644
index 000000000000..ed879bf42995
--- /dev/null
+++ b/arch/x86/kernel/vsyscall-sysenter_32.S
@@ -0,0 +1,122 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
12 * %ecx itself for arg2. The pushing is because the sysexit instruction
13 * (found in entry.S) requires that we clobber %ecx with the desired %esp.
14 * User code might expect that %ecx is unclobbered though, as it would be
15 * for returning via the iret instruction, so we must push and pop.
16 *
17 * The caller puts arg3 in %edx, which the sysexit instruction requires
18 * for %eip. Thus, exactly as for arg2, we must push and pop.
19 *
20 * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
21 * instruction clobbers %esp, the user's %esp won't even survive entry
22 * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
23 * arg6 from the stack.
24 *
25 * You cannot use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child.
27 */
28 .text
29 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function
31__kernel_vsyscall:
32.LSTART_vsyscall:
33 push %ecx
34.Lpush_ecx:
35 push %edx
36.Lpush_edx:
37 push %ebp
38.Lenter_kernel:
39 movl %esp,%ebp
40 sysenter
41
42	/* 7: align return point with nops to make disassembly easier */
43 .space 7,0x90
44
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
49SYSENTER_RETURN:
50 pop %ebp
51.Lpop_ebp:
52 pop %edx
53.Lpop_edx:
54 pop %ecx
55.Lpop_ecx:
56 ret
57.LEND_vsyscall:
58 .size __kernel_vsyscall,.-.LSTART_vsyscall
59 .previous
60
61 .section .eh_frame,"a",@progbits
62.LSTARTFRAMEDLSI:
63 .long .LENDCIEDLSI-.LSTARTCIEDLSI
64.LSTARTCIEDLSI:
65 .long 0 /* CIE ID */
66 .byte 1 /* Version number */
67 .string "zR" /* NUL-terminated augmentation string */
68 .uleb128 1 /* Code alignment factor */
69 .sleb128 -4 /* Data alignment factor */
70 .byte 8 /* Return address register column */
71 .uleb128 1 /* Augmentation value length */
72 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
73 .byte 0x0c /* DW_CFA_def_cfa */
74 .uleb128 4
75 .uleb128 4
76 .byte 0x88 /* DW_CFA_offset, column 0x8 */
77 .uleb128 1
78 .align 4
79.LENDCIEDLSI:
80 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
81.LSTARTFDEDLSI:
82 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
83 .long .LSTART_vsyscall-. /* PC-relative start address */
84 .long .LEND_vsyscall-.LSTART_vsyscall
85 .uleb128 0
86 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */
115 .align 4
116.LENDFDEDLSI:
117 .previous
118
119/*
120 * Get the common code for the sigreturn entry points.
121 */
122#include "vsyscall-sigreturn_32.S"
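Both the int $0x80 and sysenter flavours of this page export the same __kernel_vsyscall entry, and the kernel picks one at boot. A minimal user-space sketch of calling it, assuming a 32-bit process; getauxval() is a much later glibc helper used here purely for illustration, and error handling is omitted:

	/* Hedged sketch: issue getpid() through __kernel_vsyscall, found
	 * via the AT_SYSINFO entry of the ELF auxiliary vector. */
	#include <sys/auxv.h>
	#include <asm/unistd.h>

	static long vsyscall_getpid(void)
	{
		long entry = (long)getauxval(AT_SYSINFO);
		long ret;

		/* Syscall number goes in %eax, just as for int $0x80;
		 * getpid() takes no arguments. */
		asm volatile("call *%1"
			     : "=a" (ret)
			     : "r" (entry), "a" (__NR_getpid)
			     : "memory");
		return ret;
	}

Because the stub saves and restores %ecx, %edx and %ebp itself (see the comment at the top of this file), the caller needs no extra register clobbers for them.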
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
new file mode 100644
index 000000000000..a5ab3dc4fd25
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.S
@@ -0,0 +1,15 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
new file mode 100644
index 000000000000..4a8b0ed9b8fb
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_32.lds.S
@@ -0,0 +1,67 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
new file mode 100644
index 000000000000..06c34949bfdc
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -0,0 +1,349 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
7 * Thanks to hpa@transmeta.com for some useful hints.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
12 * at virtual address -10Mbyte+1024bytes etc. There are at most 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29#include <linux/clocksource.h>
30#include <linux/getcpu.h>
31#include <linux/cpu.h>
32#include <linux/smp.h>
33#include <linux/notifier.h>
34
35#include <asm/vsyscall.h>
36#include <asm/pgtable.h>
37#include <asm/page.h>
38#include <asm/unistd.h>
39#include <asm/fixmap.h>
40#include <asm/errno.h>
41#include <asm/io.h>
42#include <asm/segment.h>
43#include <asm/desc.h>
44#include <asm/topology.h>
45#include <asm/vgtod.h>
46
47#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
48#define __syscall_clobber "r11","rcx","memory"
49#define __pa_vsymbol(x) \
50 ({unsigned long v; \
51 extern char __vsyscall_0; \
52 asm("" : "=r" (v) : "0" (x)); \
53 ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
54
55/*
56 * vsyscall_gtod_data contains data that is:
57 * - read-only from vsyscalls
58 * - written by the timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
59 * Try to keep this structure as small as possible to avoid cache-line ping-pong.
60 */
61int __vgetcpu_mode __section_vgetcpu_mode;
62
63struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
64{
65 .lock = SEQLOCK_UNLOCKED,
66 .sysctl_enabled = 1,
67};
68
69void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
70{
71 unsigned long flags;
72
73 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
74 /* copy vsyscall data */
75 vsyscall_gtod_data.clock.vread = clock->vread;
76 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
77 vsyscall_gtod_data.clock.mask = clock->mask;
78 vsyscall_gtod_data.clock.mult = clock->mult;
79 vsyscall_gtod_data.clock.shift = clock->shift;
80 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
81 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
82 vsyscall_gtod_data.sys_tz = sys_tz;
83 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
84 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
85 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
86}
87
88/* RED-PEN may want to re-add seq locking, but then the variable should be
89 * write-once.
90 */
91static __always_inline void do_get_tz(struct timezone * tz)
92{
93 *tz = __vsyscall_gtod_data.sys_tz;
94}
95
96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
97{
98 int ret;
99 asm volatile("vsysc2: syscall"
100 : "=a" (ret)
101 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
102 : __syscall_clobber );
103 return ret;
104}
105
106static __always_inline long time_syscall(long *t)
107{
108 long secs;
109 asm volatile("vsysc1: syscall"
110 : "=a" (secs)
111 : "0" (__NR_time),"D" (t) : __syscall_clobber);
112 return secs;
113}
114
115static __always_inline void do_vgettimeofday(struct timeval * tv)
116{
117 cycle_t now, base, mask, cycle_delta;
118 unsigned seq;
119 unsigned long mult, shift, nsec;
120 cycle_t (*vread)(void);
121 do {
122 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
123
124 vread = __vsyscall_gtod_data.clock.vread;
125 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
126 gettimeofday(tv,NULL);
127 return;
128 }
129 now = vread();
130 base = __vsyscall_gtod_data.clock.cycle_last;
131 mask = __vsyscall_gtod_data.clock.mask;
132 mult = __vsyscall_gtod_data.clock.mult;
133 shift = __vsyscall_gtod_data.clock.shift;
134
135 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
136 nsec = __vsyscall_gtod_data.wall_time_nsec;
137 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
138
139 /* calculate interval: */
140 cycle_delta = (now - base) & mask;
141 /* convert to nsecs: */
142 nsec += (cycle_delta * mult) >> shift;
143
144 while (nsec >= NSEC_PER_SEC) {
145 tv->tv_sec += 1;
146 nsec -= NSEC_PER_SEC;
147 }
148 tv->tv_usec = nsec / NSEC_PER_USEC;
149}
150
151int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
152{
153 if (tv)
154 do_vgettimeofday(tv);
155 if (tz)
156 do_get_tz(tz);
157 return 0;
158}
159
160/* This will break when the xtime seconds get inaccurate, but that is
161 * unlikely */
162time_t __vsyscall(1) vtime(time_t *t)
163{
164 struct timeval tv;
165 time_t result;
166 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
167 return time_syscall(t);
168
169 vgettimeofday(&tv, 0);
170 result = tv.tv_sec;
171 if (t)
172 *t = result;
173 return result;
174}
175
176/* Fast way to get the current CPU and node.
177   This helps to do per-node and per-CPU caches in user space.
178   The result is not guaranteed without CPU affinity, but it usually
179   works out because the scheduler tries to keep a thread on the same
180   CPU.
181
182   tcache must point to a two-element array of longs.
183   All arguments can be NULL. */
184long __vsyscall(2)
185vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
186{
187 unsigned int dummy, p;
188 unsigned long j = 0;
189
190	/* Fast cache - only recompute the value once per jiffy and avoid
191	   the relatively costly rdtscp/cpuid otherwise.
192	   This works because the scheduler usually keeps the process
193	   on the same CPU and this syscall doesn't guarantee its
194	   results anyway.
195	   We do this here because otherwise user space would do it on
196	   its own in a likely inferior way (no access to jiffies).
197	   If you don't like it, pass NULL. */
198 if (tcache && tcache->blob[0] == (j = __jiffies)) {
199 p = tcache->blob[1];
200 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
201 /* Load per CPU data from RDTSCP */
202 rdtscp(dummy, dummy, p);
203 } else {
204 /* Load per CPU data from GDT */
205 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
206 }
207 if (tcache) {
208 tcache->blob[0] = j;
209 tcache->blob[1] = p;
210 }
211 if (cpu)
212 *cpu = p & 0xfff;
213 if (node)
214 *node = p >> 12;
215 return 0;
216}
217
218long __vsyscall(3) venosys_1(void)
219{
220 return -ENOSYS;
221}
222
223#ifdef CONFIG_SYSCTL
224
225#define SYSCALL 0x050f
226#define NOP2 0x9090
227
228/*
229 * NOP out syscall in vsyscall page when not needed.
230 */
231static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
232 void __user *buffer, size_t *lenp, loff_t *ppos)
233{
234 extern u16 vsysc1, vsysc2;
235 u16 __iomem *map1;
236 u16 __iomem *map2;
237 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
238 if (!write)
239 return ret;
240 /* gcc has some trouble with __va(__pa()), so just do it this
241 way. */
242 map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
243 if (!map1)
244 return -ENOMEM;
245 map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
246 if (!map2) {
247 ret = -ENOMEM;
248 goto out;
249 }
250 if (!vsyscall_gtod_data.sysctl_enabled) {
251 writew(SYSCALL, map1);
252 writew(SYSCALL, map2);
253 } else {
254 writew(NOP2, map1);
255 writew(NOP2, map2);
256 }
257 iounmap(map2);
258out:
259 iounmap(map1);
260 return ret;
261}
262
263static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
264 void __user *oldval, size_t __user *oldlenp,
265 void __user *newval, size_t newlen)
266{
267 return -ENOSYS;
268}
269
270static ctl_table kernel_table2[] = {
271 { .ctl_name = 99, .procname = "vsyscall64",
272 .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
273 .mode = 0644,
274 .strategy = vsyscall_sysctl_nostrat,
275 .proc_handler = vsyscall_sysctl_change },
276 {}
277};
278
279static ctl_table kernel_root_table2[] = {
280 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
281 .child = kernel_table2 },
282 {}
283};
284
285#endif
286
287/* Assume __initcall executes before all user space. Hopefully kmod
288 doesn't violate that. We'll find out if it does. */
289static void __cpuinit vsyscall_set_cpu(int cpu)
290{
291 unsigned long *d;
292 unsigned long node = 0;
293#ifdef CONFIG_NUMA
294 node = cpu_to_node[cpu];
295#endif
296 if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
297 write_rdtscp_aux((node << 12) | cpu);
298
299 /* Store cpu number in limit so that it can be loaded quickly
300 in user space in vgetcpu.
301 12 bits for the CPU and 8 bits for the node. */
302 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
303 *d = 0x0f40000000000ULL;
304 *d |= cpu;
305 *d |= (node & 0xf) << 12;
306 *d |= (node >> 4) << 48;
307}
308
309static void __cpuinit cpu_vsyscall_init(void *arg)
310{
311 /* preemption should be already off */
312 vsyscall_set_cpu(raw_smp_processor_id());
313}
314
315static int __cpuinit
316cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
317{
318 long cpu = (long)arg;
319 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
320 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
321 return NOTIFY_DONE;
322}
323
324static void __init map_vsyscall(void)
325{
326 extern char __vsyscall_0;
327 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
328
329 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
330 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
331}
332
333static int __init vsyscall_init(void)
334{
335 BUG_ON(((unsigned long) &vgettimeofday !=
336 VSYSCALL_ADDR(__NR_vgettimeofday)));
337 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
338 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
339 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
340 map_vsyscall();
341#ifdef CONFIG_SYSCTL
342 register_sysctl_table(kernel_root_table2);
343#endif
344 on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
345 hotcpu_notifier(cpu_vsyscall_notifier, 0);
346 return 0;
347}
348
349__initcall(vsyscall_init);
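vgetcpu() above lives at a fixed address in every 64-bit process, so user space can call it without any setup. A minimal sketch, assuming the traditional layout from the header comment (vsyscall page at -10 MB, 1024 bytes per slot, so slot 2 sits at 0xffffffffff600800; that address is derived from the comment, not quoted from this patch, and the tcache argument is simply skipped):

	/* Hedged sketch: ask the vsyscall page which CPU and NUMA node
	 * this thread is currently running on. */
	#include <stdio.h>

	typedef long (*vgetcpu_t)(unsigned *cpu, unsigned *node, void *tcache);

	int main(void)
	{
		vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
		unsigned cpu, node;

		vgetcpu(&cpu, &node, NULL);
		printf("cpu %u, node %u\n", cpu, node);
		return 0;
	}

The result is only a hint: as the comment above the function says, nothing pins the thread to that CPU after the call returns.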
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
new file mode 100644
index 000000000000..77c25b307635
--- /dev/null
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -0,0 +1,62 @@
1/* Exports for assembly files.
2 All C exports should go in the respective C files. */
3
4#include <linux/module.h>
5#include <linux/smp.h>
6
7#include <asm/semaphore.h>
8#include <asm/processor.h>
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11
12EXPORT_SYMBOL(kernel_thread);
13
14EXPORT_SYMBOL(__down_failed);
15EXPORT_SYMBOL(__down_failed_interruptible);
16EXPORT_SYMBOL(__down_failed_trylock);
17EXPORT_SYMBOL(__up_wakeup);
18
19EXPORT_SYMBOL(__get_user_1);
20EXPORT_SYMBOL(__get_user_2);
21EXPORT_SYMBOL(__get_user_4);
22EXPORT_SYMBOL(__get_user_8);
23EXPORT_SYMBOL(__put_user_1);
24EXPORT_SYMBOL(__put_user_2);
25EXPORT_SYMBOL(__put_user_4);
26EXPORT_SYMBOL(__put_user_8);
27
28EXPORT_SYMBOL(copy_user_generic);
29EXPORT_SYMBOL(__copy_user_nocache);
30EXPORT_SYMBOL(copy_from_user);
31EXPORT_SYMBOL(copy_to_user);
32EXPORT_SYMBOL(__copy_from_user_inatomic);
33
34EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page);
36
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtins for most of these,
45 but gcc sometimes decides not to inline them. */
46#undef memcpy
47#undef memset
48#undef memmove
49
50extern void * memset(void *,int,__kernel_size_t);
51extern void * memcpy(void *,const void *,__kernel_size_t);
52extern void * __memcpy(void *,const void *,__kernel_size_t);
53
54EXPORT_SYMBOL(memset);
55EXPORT_SYMBOL(memcpy);
56EXPORT_SYMBOL(__memcpy);
57
58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 000000000000..329da276c6f1
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/lib/Makefile_32
3else
4include ${srctree}/arch/x86/lib/Makefile_64
5endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
new file mode 100644
index 000000000000..98d1f1e2e2ef
--- /dev/null
+++ b/arch/x86/lib/Makefile_32
@@ -0,0 +1,11 @@
1#
2# Makefile for i386-specific library files.
3#
4
5
6lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
7 bitops_32.o semaphore_32.o string_32.o
8
9lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
10
11obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
new file mode 100644
index 000000000000..bbabad3c9335
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial_64.o := -funroll-loops
6
7obj-y := io_64.o iomap_copy_64.o
8obj-$(CONFIG_SMP) += msr-on-cpu.o
9
10lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
11 usercopy_64.o getuser_64.o putuser_64.o \
12 thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
13lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
new file mode 100644
index 000000000000..afd0045595d4
--- /dev/null
+++ b/arch/x86/lib/bitops_32.c
@@ -0,0 +1,70 @@
1#include <linux/bitops.h>
2#include <linux/module.h>
3
4/**
5 * find_next_bit - find the next set bit in a memory region
6 * @addr: The address to base the search on
7 * @offset: The bitnumber to start searching at
8 * @size: The maximum size to search
9 */
10int find_next_bit(const unsigned long *addr, int size, int offset)
11{
12 const unsigned long *p = addr + (offset >> 5);
13 int set = 0, bit = offset & 31, res;
14
15 if (bit) {
16 /*
17 * Look for nonzero in the first 32 bits:
18 */
19 __asm__("bsfl %1,%0\n\t"
20 "jne 1f\n\t"
21 "movl $32, %0\n"
22 "1:"
23 : "=r" (set)
24 : "r" (*p >> bit));
25 if (set < (32 - bit))
26 return set + offset;
27 set = 32 - bit;
28 p++;
29 }
30 /*
31 * No set bit yet, search remaining full words for a bit
32 */
33 res = find_first_bit (p, size - 32 * (p - addr));
34 return (offset + set + res);
35}
36EXPORT_SYMBOL(find_next_bit);
37
38/**
39 * find_next_zero_bit - find the next zero bit in a memory region
40 * @addr: The address to base the search on
41 * @offset: The bitnumber to start searching at
42 * @size: The maximum size to search
43 */
44int find_next_zero_bit(const unsigned long *addr, int size, int offset)
45{
46 const unsigned long *p = addr + (offset >> 5);
47 int set = 0, bit = offset & 31, res;
48
49 if (bit) {
50 /*
51 * Look for zero in the first 32 bits.
52 */
53 __asm__("bsfl %1,%0\n\t"
54 "jne 1f\n\t"
55 "movl $32, %0\n"
56 "1:"
57 : "=r" (set)
58 : "r" (~(*p >> bit)));
59 if (set < (32 - bit))
60 return set + offset;
61 set = 32 - bit;
62 p++;
63 }
64 /*
65 * No zero yet, search remaining full words for a zero
66 */
67 res = find_first_zero_bit(p, size - 32 * (p - addr));
68 return (offset + set + res);
69}
70EXPORT_SYMBOL(find_next_zero_bit);
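find_next_bit() and find_next_zero_bit() above return the index of the next matching bit, or a value greater than or equal to size when there is none, so callers usually drive them from a simple scan loop. A minimal sketch (the helper name is made up; this tree has no for_each_set_bit wrapper):

	#include <linux/kernel.h>
	#include <linux/bitops.h>

	/* Print every set bit of a bitmap using the routine above. */
	static void dump_set_bits(const unsigned long *map, int size)
	{
		int bit;

		for (bit = find_next_bit(map, size, 0);
		     bit < size;
		     bit = find_next_bit(map, size, bit + 1))
			printk(KERN_DEBUG "bit %d is set\n", bit);
	}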
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
new file mode 100644
index 000000000000..95b6d9639fba
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
1#include <linux/bitops.h>
2
3#undef find_first_zero_bit
4#undef find_next_zero_bit
5#undef find_first_bit
6#undef find_next_bit
7
8static inline long
9__find_first_zero_bit(const unsigned long * addr, unsigned long size)
10{
11 long d0, d1, d2;
12 long res;
13
14 /*
15 * We must test the size in words, not in bits, because
16 * otherwise incoming sizes in the range -63..-1 will not run
17 * any scasq instructions, and then the flags used by the je
18 * instruction will have whatever random value was in place
19 * before. Nobody should call us like that, but
20 * find_next_zero_bit() does when offset and size are at the
21 * same word and it fails to find a zero itself.
22 */
23 size += 63;
24 size >>= 6;
25 if (!size)
26 return 0;
27 asm volatile(
28 " repe; scasq\n"
29 " je 1f\n"
30 " xorq -8(%%rdi),%%rax\n"
31 " subq $8,%%rdi\n"
32 " bsfq %%rax,%%rdx\n"
33 "1: subq %[addr],%%rdi\n"
34 " shlq $3,%%rdi\n"
35 " addq %%rdi,%%rdx"
36 :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
37 :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
38 [addr] "S" (addr) : "memory");
39 /*
40 * Any register would do for [addr] above, but GCC tends to
41 * prefer rbx over rsi, even though rsi is readily available
42 * and doesn't have to be saved.
43 */
44 return res;
45}
46
47/**
48 * find_first_zero_bit - find the first zero bit in a memory region
49 * @addr: The address to start the search at
50 * @size: The maximum size to search
51 *
52 * Returns the bit-number of the first zero bit, not the number of the byte
53 * containing a bit.
54 */
55long find_first_zero_bit(const unsigned long * addr, unsigned long size)
56{
57 return __find_first_zero_bit (addr, size);
58}
59
60/**
61 * find_next_zero_bit - find the next zero bit in a memory region
62 * @addr: The address to base the search on
63 * @offset: The bitnumber to start searching at
64 * @size: The maximum size to search
65 */
66long find_next_zero_bit (const unsigned long * addr, long size, long offset)
67{
68 const unsigned long * p = addr + (offset >> 6);
69 unsigned long set = 0;
70 unsigned long res, bit = offset&63;
71
72 if (bit) {
73 /*
74 * Look for zero in first word
75 */
76 asm("bsfq %1,%0\n\t"
77 "cmoveq %2,%0"
78 : "=r" (set)
79 : "r" (~(*p >> bit)), "r"(64L));
80 if (set < (64 - bit))
81 return set + offset;
82 set = 64 - bit;
83 p++;
84 }
85 /*
86 * No zero yet, search remaining full words for a zero
87 */
88 res = __find_first_zero_bit (p, size - 64 * (p - addr));
89
90 return (offset + set + res);
91}
92
93static inline long
94__find_first_bit(const unsigned long * addr, unsigned long size)
95{
96 long d0, d1;
97 long res;
98
99 /*
100 * We must test the size in words, not in bits, because
101 * otherwise incoming sizes in the range -63..-1 will not run
102 * any scasq instructions, and then the flags used by the jz
103 * instruction will have whatever random value was in place
104 * before. Nobody should call us like that, but
105 * find_next_bit() does when offset and size are at the same
106 * word and it fails to find a one itself.
107 */
108 size += 63;
109 size >>= 6;
110 if (!size)
111 return 0;
112 asm volatile(
113 " repe; scasq\n"
114 " jz 1f\n"
115 " subq $8,%%rdi\n"
116 " bsfq (%%rdi),%%rax\n"
117 "1: subq %[addr],%%rdi\n"
118 " shlq $3,%%rdi\n"
119 " addq %%rdi,%%rax"
120 :"=a" (res), "=&c" (d0), "=&D" (d1)
121 :"0" (0ULL), "1" (size), "2" (addr),
122 [addr] "r" (addr) : "memory");
123 return res;
124}
125
126/**
127 * find_first_bit - find the first set bit in a memory region
128 * @addr: The address to start the search at
129 * @size: The maximum size to search
130 *
131 * Returns the bit-number of the first set bit, not the number of the byte
132 * containing a bit.
133 */
134long find_first_bit(const unsigned long * addr, unsigned long size)
135{
136 return __find_first_bit(addr,size);
137}
138
139/**
140 * find_next_bit - find the next set bit in a memory region
141 * @addr: The address to base the search on
142 * @offset: The bitnumber to start searching at
143 * @size: The maximum size to search
144 */
145long find_next_bit(const unsigned long * addr, long size, long offset)
146{
147 const unsigned long * p = addr + (offset >> 6);
148 unsigned long set = 0, bit = offset & 63, res;
149
150 if (bit) {
151 /*
152 * Look for nonzero in the first 64 bits:
153 */
154 asm("bsfq %1,%0\n\t"
155 "cmoveq %2,%0\n\t"
156 : "=r" (set)
157 : "r" (*p >> bit), "r" (64L));
158 if (set < (64 - bit))
159 return set + offset;
160 set = 64 - bit;
161 p++;
162 }
163 /*
164 * No set bit yet, search remaining full words for a bit
165 */
166 res = __find_first_bit (p, size - 64 * (p - addr));
167 return (offset + set + res);
168}
169
170#include <linux/module.h>
171
172EXPORT_SYMBOL(find_next_bit);
173EXPORT_SYMBOL(find_first_bit);
174EXPORT_SYMBOL(find_first_zero_bit);
175EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
1#include <linux/module.h>
2#include <linux/bitops.h>
3
4/* Find string of zero bits in a bitmap */
5unsigned long
6find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
7{
8 unsigned long n, end, i;
9
10 again:
11 n = find_next_zero_bit(bitmap, nbits, start);
12 if (n == -1)
13 return -1;
14
15 /* could test bitsliced, but it's hardly worth it */
16 end = n+len;
17 if (end >= nbits)
18 return -1;
19 for (i = n+1; i < end; i++) {
20 if (test_bit(i, bitmap)) {
21 start = i+1;
22 goto again;
23 }
24 }
25 return n;
26}
27
28EXPORT_SYMBOL(find_next_zero_string);
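find_next_zero_string() above returns the start of a run of len clear bits below nbits, or -1 when no such run exists, which makes it the core of simple range allocators. A hedged sketch of that pattern (the function name and the locking policy are illustrative only):

	#include <linux/bitops.h>

	/* Reserve a contiguous run of 'len' slots in an allocation bitmap.
	 * Returns the first slot, or -1 if no run is free.  The caller is
	 * assumed to hold whatever lock protects the bitmap. */
	static long alloc_slot_range(unsigned long *bitmap, long nbits, int len)
	{
		long start, i;

		start = find_next_zero_string(bitmap, 0, nbits, len);
		if (start == -1)
			return -1;
		for (i = start; i < start + len; i++)
			__set_bit(i, bitmap);
		return start;
	}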
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 000000000000..adbccd0bbb78
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,546 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IP/TCP/UDP checksumming routines
7 *
8 * Authors: Jorge Cwik, <jorge@laser.satlink.net>
9 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
10 * Tom May, <ftom@netcom.com>
11 * Pentium Pro/II routines:
12 * Alexander Kjeldaas <astor@guardian.no>
13 * Finn Arne Gangstad <finnag@guardian.no>
14 * Lots of code moved from tcp.c and ip.c; see those files
15 * for more names.
16 *
17 * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
18 * handling.
19 * Andi Kleen, add zeroing on error
20 * converted to pure assembler
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
26 */
27
28#include <linux/linkage.h>
29#include <asm/dwarf2.h>
30#include <asm/errno.h>
31
32/*
33 * computes a partial checksum, e.g. for TCP/UDP fragments
34 */
35
36/*
37unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
38 */
39
40.text
41
42#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
43
44 /*
45 * Experiments with Ethernet and SLIP connections show that buff
46 * is aligned on either a 2-byte or 4-byte boundary. We get at
47 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
48 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
49 * alignment for the unrolled loop.
50 */
51ENTRY(csum_partial)
52 CFI_STARTPROC
53 pushl %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0
56 pushl %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len
61 movl 12(%esp),%esi # Function arg: unsigned char *buff
62 testl $3, %esi # Check alignment.
63 jz 2f # Jump if alignment is ok.
64 testl $1, %esi # Check alignment.
65 jz 10f # Jump if alignment is boundary of 2bytes.
66
67 # buf is odd
68 dec %ecx
69 jl 8f
70 movzbl (%esi), %ebx
71 adcl %ebx, %eax
72 roll $8, %eax
73 inc %esi
74 testl $2, %esi
75 jz 2f
7610:
77 subl $2, %ecx # Alignment uses up two bytes.
78 jae 1f # Jump if we had at least two bytes.
79 addl $2, %ecx # ecx was < 2. Deal with it.
80 jmp 4f
811: movw (%esi), %bx
82 addl $2, %esi
83 addw %bx, %ax
84 adcl $0, %eax
852:
86 movl %ecx, %edx
87 shrl $5, %ecx
88 jz 2f
89 testl %esi, %esi
901: movl (%esi), %ebx
91 adcl %ebx, %eax
92 movl 4(%esi), %ebx
93 adcl %ebx, %eax
94 movl 8(%esi), %ebx
95 adcl %ebx, %eax
96 movl 12(%esi), %ebx
97 adcl %ebx, %eax
98 movl 16(%esi), %ebx
99 adcl %ebx, %eax
100 movl 20(%esi), %ebx
101 adcl %ebx, %eax
102 movl 24(%esi), %ebx
103 adcl %ebx, %eax
104 movl 28(%esi), %ebx
105 adcl %ebx, %eax
106 lea 32(%esi), %esi
107 dec %ecx
108 jne 1b
109 adcl $0, %eax
1102: movl %edx, %ecx
111 andl $0x1c, %edx
112 je 4f
113 shrl $2, %edx # This clears CF
1143: adcl (%esi), %eax
115 lea 4(%esi), %esi
116 dec %edx
117 jne 3b
118 adcl $0, %eax
1194: andl $3, %ecx
120 jz 7f
121 cmpl $2, %ecx
122 jb 5f
123 movw (%esi),%cx
124 leal 2(%esi),%esi
125 je 6f
126 shll $16,%ecx
1275: movb (%esi),%cl
1286: addl %ecx,%eax
129 adcl $0, %eax
1307:
131 testl $1, 12(%esp)
132 jz 8f
133 roll $8, %eax
1348:
135 popl %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx
138 popl %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi
141 ret
142 CFI_ENDPROC
143ENDPROC(csum_partial)
144
145#else
146
147/* Version for PentiumII/PPro */
148
149ENTRY(csum_partial)
150 CFI_STARTPROC
151 pushl %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0
154 pushl %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len
159 movl 12(%esp),%esi # Function arg: const unsigned char *buf
160
161 testl $3, %esi
162 jnz 25f
16310:
164 movl %ecx, %edx
165 movl %ecx, %ebx
166 andl $0x7c, %ebx
167 shrl $7, %ecx
168 addl %ebx,%esi
169 shrl $2, %ebx
170 negl %ebx
171 lea 45f(%ebx,%ebx,2), %ebx
172 testl %esi, %esi
173 jmp *%ebx
174
175 # Handle 2-byte-aligned regions
17620: addw (%esi), %ax
177 lea 2(%esi), %esi
178 adcl $0, %eax
179 jmp 10b
18025:
181 testl $1, %esi
182 jz 30f
183 # buf is odd
184 dec %ecx
185 jl 90f
186 movzbl (%esi), %ebx
187 addl %ebx, %eax
188 adcl $0, %eax
189 roll $8, %eax
190 inc %esi
191 testl $2, %esi
192 jz 10b
193
19430: subl $2, %ecx
195 ja 20b
196 je 32f
197 addl $2, %ecx
198 jz 80f
199 movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
200 addl %ebx, %eax
201 adcl $0, %eax
202 jmp 80f
20332:
204 addw (%esi), %ax # csumming 2 bytes, 2-aligned
205 adcl $0, %eax
206 jmp 80f
207
20840:
209 addl -128(%esi), %eax
210 adcl -124(%esi), %eax
211 adcl -120(%esi), %eax
212 adcl -116(%esi), %eax
213 adcl -112(%esi), %eax
214 adcl -108(%esi), %eax
215 adcl -104(%esi), %eax
216 adcl -100(%esi), %eax
217 adcl -96(%esi), %eax
218 adcl -92(%esi), %eax
219 adcl -88(%esi), %eax
220 adcl -84(%esi), %eax
221 adcl -80(%esi), %eax
222 adcl -76(%esi), %eax
223 adcl -72(%esi), %eax
224 adcl -68(%esi), %eax
225 adcl -64(%esi), %eax
226 adcl -60(%esi), %eax
227 adcl -56(%esi), %eax
228 adcl -52(%esi), %eax
229 adcl -48(%esi), %eax
230 adcl -44(%esi), %eax
231 adcl -40(%esi), %eax
232 adcl -36(%esi), %eax
233 adcl -32(%esi), %eax
234 adcl -28(%esi), %eax
235 adcl -24(%esi), %eax
236 adcl -20(%esi), %eax
237 adcl -16(%esi), %eax
238 adcl -12(%esi), %eax
239 adcl -8(%esi), %eax
240 adcl -4(%esi), %eax
24145:
242 lea 128(%esi), %esi
243 adcl $0, %eax
244 dec %ecx
245 jge 40b
246 movl %edx, %ecx
24750: andl $3, %ecx
248 jz 80f
249
250 # Handle the last 1-3 bytes without jumping
251 notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
252 movl $0xffffff,%ebx # by the shll and shrl instructions
253 shll $3,%ecx
254 shrl %cl,%ebx
255 andl -128(%esi),%ebx # esi is 4-aligned so should be ok
256 addl %ebx,%eax
257 adcl $0,%eax
25880:
259 testl $1, 12(%esp)
260 jz 90f
261 roll $8, %eax
26290:
263 popl %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx
266 popl %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi
269 ret
270 CFI_ENDPROC
271ENDPROC(csum_partial)
272
273#endif
274
275/*
276unsigned int csum_partial_copy_generic (const char *src, char *dst,
277 int len, int sum, int *src_err_ptr, int *dst_err_ptr)
278 */
279
280/*
281 * Copy from ds while checksumming, otherwise like csum_partial
282 *
283 * The macros SRC and DST specify the type of access for the instruction.
284 * thus we can call a custom exception handler for all access types.
285 *
286 * FIXME: could someone double-check whether I haven't mixed up some SRC and
287 * DST definitions? It's damn hard to trigger all cases. I hope I got
288 * them all but there's no guarantee.
289 */
290
291#define SRC(y...) \
292 9999: y; \
293 .section __ex_table, "a"; \
294 .long 9999b, 6001f ; \
295 .previous
296
297#define DST(y...) \
298 9999: y; \
299 .section __ex_table, "a"; \
300 .long 9999b, 6002f ; \
301 .previous
302
303#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
304
305#define ARGBASE 16
306#define FP 12
307
308ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC
310 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0
315 pushl %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0
318 pushl %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len
323 movl ARGBASE+4(%esp),%esi # src
324 movl ARGBASE+8(%esp),%edi # dst
325
326 testl $2, %edi # Check alignment.
327 jz 2f # Jump if alignment is ok.
328 subl $2, %ecx # Alignment uses up two bytes.
329 jae 1f # Jump if we had at least two bytes.
330 addl $2, %ecx # ecx was < 2. Deal with it.
331 jmp 4f
332SRC(1: movw (%esi), %bx )
333 addl $2, %esi
334DST( movw %bx, (%edi) )
335 addl $2, %edi
336 addw %bx, %ax
337 adcl $0, %eax
3382:
339 movl %ecx, FP(%esp)
340 shrl $5, %ecx
341 jz 2f
342 testl %esi, %esi
343SRC(1: movl (%esi), %ebx )
344SRC( movl 4(%esi), %edx )
345 adcl %ebx, %eax
346DST( movl %ebx, (%edi) )
347 adcl %edx, %eax
348DST( movl %edx, 4(%edi) )
349
350SRC( movl 8(%esi), %ebx )
351SRC( movl 12(%esi), %edx )
352 adcl %ebx, %eax
353DST( movl %ebx, 8(%edi) )
354 adcl %edx, %eax
355DST( movl %edx, 12(%edi) )
356
357SRC( movl 16(%esi), %ebx )
358SRC( movl 20(%esi), %edx )
359 adcl %ebx, %eax
360DST( movl %ebx, 16(%edi) )
361 adcl %edx, %eax
362DST( movl %edx, 20(%edi) )
363
364SRC( movl 24(%esi), %ebx )
365SRC( movl 28(%esi), %edx )
366 adcl %ebx, %eax
367DST( movl %ebx, 24(%edi) )
368 adcl %edx, %eax
369DST( movl %edx, 28(%edi) )
370
371 lea 32(%esi), %esi
372 lea 32(%edi), %edi
373 dec %ecx
374 jne 1b
375 adcl $0, %eax
3762: movl FP(%esp), %edx
377 movl %edx, %ecx
378 andl $0x1c, %edx
379 je 4f
380 shrl $2, %edx # This clears CF
381SRC(3: movl (%esi), %ebx )
382 adcl %ebx, %eax
383DST( movl %ebx, (%edi) )
384 lea 4(%esi), %esi
385 lea 4(%edi), %edi
386 dec %edx
387 jne 3b
388 adcl $0, %eax
3894: andl $3, %ecx
390 jz 7f
391 cmpl $2, %ecx
392 jb 5f
393SRC( movw (%esi), %cx )
394 leal 2(%esi), %esi
395DST( movw %cx, (%edi) )
396 leal 2(%edi), %edi
397 je 6f
398 shll $16,%ecx
399SRC(5: movb (%esi), %cl )
400DST( movb %cl, (%edi) )
4016: addl %ecx, %eax
402 adcl $0, %eax
4037:
4045000:
405
406# Exception handler:
407.section .fixup, "ax"
408
4096001:
410 movl ARGBASE+20(%esp), %ebx # src_err_ptr
411 movl $-EFAULT, (%ebx)
412
413 # zero the complete destination - computing the rest
414 # is too much work
415 movl ARGBASE+8(%esp), %edi # dst
416 movl ARGBASE+12(%esp), %ecx # len
417 xorl %eax,%eax
418 rep ; stosb
419
420 jmp 5000b
421
4226002:
423 movl ARGBASE+24(%esp), %ebx # dst_err_ptr
424 movl $-EFAULT,(%ebx)
425 jmp 5000b
426
427.previous
428
429 popl %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx
432 popl %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi
435 popl %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret
441 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic)
443
444#else
445
446/* Version for PentiumII/PPro */
447
448#define ROUND1(x) \
449 SRC(movl x(%esi), %ebx ) ; \
450 addl %ebx, %eax ; \
451 DST(movl %ebx, x(%edi) ) ;
452
453#define ROUND(x) \
454 SRC(movl x(%esi), %ebx ) ; \
455 adcl %ebx, %eax ; \
456 DST(movl %ebx, x(%edi) ) ;
457
458#define ARGBASE 12
459
460ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC
462 pushl %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0
465 pushl %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0
468 pushl %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst
473 movl ARGBASE+12(%esp),%ecx #len
474 movl ARGBASE+16(%esp),%eax #sum
475# movl %ecx, %edx
476 movl %ecx, %ebx
477 movl %esi, %edx
478 shrl $6, %ecx
479 andl $0x3c, %ebx
480 negl %ebx
481 subl %ebx, %esi
482 subl %ebx, %edi
483 lea -1(%esi),%edx
484 andl $-32,%edx
485 lea 3f(%ebx,%ebx), %ebx
486 testl %esi, %esi
487 jmp *%ebx
4881: addl $64,%esi
489 addl $64,%edi
490 SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
491 ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
492 ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
493 ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
494 ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
4953: adcl $0,%eax
496 addl $64, %edx
497 dec %ecx
498 jge 1b
4994: movl ARGBASE+12(%esp),%edx #len
500 andl $3, %edx
501 jz 7f
502 cmpl $2, %edx
503 jb 5f
504SRC( movw (%esi), %dx )
505 leal 2(%esi), %esi
506DST( movw %dx, (%edi) )
507 leal 2(%edi), %edi
508 je 6f
509 shll $16,%edx
5105:
511SRC( movb (%esi), %dl )
512DST( movb %dl, (%edi) )
5136: addl %edx, %eax
514 adcl $0, %eax
5157:
516.section .fixup, "ax"
5176001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
518 movl $-EFAULT, (%ebx)
519 # zero the complete destination (computing the rest is too much work)
520 movl ARGBASE+8(%esp),%edi # dst
521 movl ARGBASE+12(%esp),%ecx # len
522 xorl %eax,%eax
523 rep; stosb
524 jmp 7b
5256002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
526 movl $-EFAULT, (%ebx)
527 jmp 7b
528.previous
529
530 popl %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi
533 popl %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi
536 popl %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx
539 ret
540 CFI_ENDPROC
541ENDPROC(csum_partial_copy_generic)
542
543#undef ROUND
544#undef ROUND1
545
546#endif
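csum_partial() and csum_partial_copy_generic() above produce a 32-bit folded-carry partial sum; callers finish it with csum_fold() from <asm/checksum.h> to get the 16-bit ones-complement Internet checksum. A minimal sketch (sparse __wsum/__sum16 annotations are elided for brevity):

	#include <net/checksum.h>

	/* Checksum a flat buffer with the assembly csum_partial() above
	 * and fold it down to the 16-bit value stored in IP/TCP headers. */
	static u16 ip_style_checksum(const unsigned char *buf, int len)
	{
		return csum_fold(csum_partial(buf, len, 0));
	}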
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 000000000000..9a10a78bb4a4
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
1#include <linux/linkage.h>
2#include <asm/dwarf2.h>
3
4/*
5 * Zero a page.
6 * rdi page
7 */
8 ALIGN
9clear_page_c:
10 CFI_STARTPROC
11 movl $4096/8,%ecx
12 xorl %eax,%eax
13 rep stosq
14 ret
15 CFI_ENDPROC
16ENDPROC(clear_page)
17
18ENTRY(clear_page)
19 CFI_STARTPROC
20 xorl %eax,%eax
21 movl $4096/64,%ecx
22 .p2align 4
23.Lloop:
24 decl %ecx
25#define PUT(x) movq %rax,x*8(%rdi)
26 movq %rax,(%rdi)
27 PUT(1)
28 PUT(2)
29 PUT(3)
30 PUT(4)
31 PUT(5)
32 PUT(6)
33 PUT(7)
34 leaq 64(%rdi),%rdi
35 jnz .Lloop
36 nop
37 ret
38 CFI_ENDPROC
39.Lclear_page_end:
40ENDPROC(clear_page)
41
42 /* Some CPUs run faster using the string instructions.
43    It is also a lot simpler. Use it when possible. */
44
45#include <asm/cpufeature.h>
46
47 .section .altinstr_replacement,"ax"
481: .byte 0xeb /* jmp <disp8> */
49 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
502:
51 .previous
52 .section .altinstructions,"a"
53 .align 8
54 .quad clear_page
55 .quad 1b
56 .byte X86_FEATURE_REP_GOOD
57 .byte .Lclear_page_end - clear_page
58 .byte 2b - 1b
59 .previous
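The .altinstructions and .altinstr_replacement records above are consumed by apply_alternatives() at boot: when the CPU advertises the named feature bit, the original code is patched with the replacement jump to clear_page_c. As a rough illustration only (field names approximate this tree's struct alt_instr and are not a quotation), each record emitted above corresponds to:

	/* Approximate shape of one .altinstructions record. */
	struct alt_instr_sketch {
		u8 *instr;		/* .quad clear_page: code to patch      */
		u8 *replacement;	/* .quad 1b: the jmp to clear_page_c    */
		u8  cpuid;		/* .byte X86_FEATURE_REP_GOOD           */
		u8  instrlen;		/* .byte .Lclear_page_end - clear_page  */
		u8  replacementlen;	/* .byte 2b - 1b                        */
	};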
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 000000000000..727a5d46d2fc
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5
6 ALIGN
7copy_page_c:
8 CFI_STARTPROC
9 movl $4096/8,%ecx
10 rep movsq
11 ret
12 CFI_ENDPROC
13ENDPROC(copy_page_c)
14
15/* Don't use streaming stores, because it is better when the target
16   ends up in the cache. */
17
18/* Could vary the prefetch distance based on SMP/UP */
19
20ENTRY(copy_page)
21 CFI_STARTPROC
22 subq $3*8,%rsp
23 CFI_ADJUST_CFA_OFFSET 3*8
24 movq %rbx,(%rsp)
25 CFI_REL_OFFSET rbx, 0
26 movq %r12,1*8(%rsp)
27 CFI_REL_OFFSET r12, 1*8
28 movq %r13,2*8(%rsp)
29 CFI_REL_OFFSET r13, 2*8
30
31 movl $(4096/64)-5,%ecx
32 .p2align 4
33.Loop64:
34 dec %rcx
35
36 movq (%rsi), %rax
37 movq 8 (%rsi), %rbx
38 movq 16 (%rsi), %rdx
39 movq 24 (%rsi), %r8
40 movq 32 (%rsi), %r9
41 movq 40 (%rsi), %r10
42 movq 48 (%rsi), %r11
43 movq 56 (%rsi), %r12
44
45 prefetcht0 5*64(%rsi)
46
47 movq %rax, (%rdi)
48 movq %rbx, 8 (%rdi)
49 movq %rdx, 16 (%rdi)
50 movq %r8, 24 (%rdi)
51 movq %r9, 32 (%rdi)
52 movq %r10, 40 (%rdi)
53 movq %r11, 48 (%rdi)
54 movq %r12, 56 (%rdi)
55
56 leaq 64 (%rsi), %rsi
57 leaq 64 (%rdi), %rdi
58
59 jnz .Loop64
60
61 movl $5,%ecx
62 .p2align 4
63.Loop2:
64 decl %ecx
65
66 movq (%rsi), %rax
67 movq 8 (%rsi), %rbx
68 movq 16 (%rsi), %rdx
69 movq 24 (%rsi), %r8
70 movq 32 (%rsi), %r9
71 movq 40 (%rsi), %r10
72 movq 48 (%rsi), %r11
73 movq 56 (%rsi), %r12
74
75 movq %rax, (%rdi)
76 movq %rbx, 8 (%rdi)
77 movq %rdx, 16 (%rdi)
78 movq %r8, 24 (%rdi)
79 movq %r9, 32 (%rdi)
80 movq %r10, 40 (%rdi)
81 movq %r11, 48 (%rdi)
82 movq %r12, 56 (%rdi)
83
84 leaq 64(%rdi),%rdi
85 leaq 64(%rsi),%rsi
86
87 jnz .Loop2
88
89 movq (%rsp),%rbx
90 CFI_RESTORE rbx
91 movq 1*8(%rsp),%r12
92 CFI_RESTORE r12
93 movq 2*8(%rsp),%r13
94 CFI_RESTORE r13
95 addq $3*8,%rsp
96 CFI_ADJUST_CFA_OFFSET -3*8
97 ret
98.Lcopy_page_end:
99 CFI_ENDPROC
100ENDPROC(copy_page)
101
102 /* Some CPUs run faster using the string copy instructions.
103    It is also a lot simpler. Use it when possible. */
104
105#include <asm/cpufeature.h>
106
107 .section .altinstr_replacement,"ax"
1081: .byte 0xeb /* jmp <disp8> */
109 .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
1102:
111 .previous
112 .section .altinstructions,"a"
113 .align 8
114 .quad copy_page
115 .quad 1b
116 .byte X86_FEATURE_REP_GOOD
117 .byte .Lcopy_page_end - copy_page
118 .byte 2b - 1b
119 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000000..70bebd310408
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17 .macro ALTERNATIVE_JUMP feature,orig,alt
180:
19 .byte 0xe9 /* 32bit jump */
20 .long \orig-1f /* by default jump to orig */
211:
22 .section .altinstr_replacement,"ax"
232: .byte 0xe9 /* near jump with 32bit immediate */
24 .long \alt-1b /* offset */ /* or alternatively to alt */
25 .previous
26 .section .altinstructions,"a"
27 .align 8
28 .quad 0b
29 .quad 2b
30 .byte \feature /* when feature is set */
31 .byte 5
32 .byte 5
33 .previous
34 .endm
35
36/* Standard copy_to_user with segment limit checking */
37ENTRY(copy_to_user)
38 CFI_STARTPROC
39 GET_THREAD_INFO(%rax)
40 movq %rdi,%rcx
41 addq %rdx,%rcx
42 jc bad_to_user
43 cmpq threadinfo_addr_limit(%rax),%rcx
44 jae bad_to_user
45 xorl %eax,%eax /* clear zero flag */
46 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47 CFI_ENDPROC
48
49ENTRY(copy_user_generic)
50 CFI_STARTPROC
51 movl $1,%ecx /* set zero flag */
52 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53 CFI_ENDPROC
54
55ENTRY(__copy_from_user_inatomic)
56 CFI_STARTPROC
57 xorl %ecx,%ecx /* clear zero flag */
58 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59 CFI_ENDPROC
60
61/* Standard copy_from_user with segment limit checking */
62ENTRY(copy_from_user)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%rax)
65 movq %rsi,%rcx
66 addq %rdx,%rcx
67 jc bad_from_user
68 cmpq threadinfo_addr_limit(%rax),%rcx
69 jae bad_from_user
70 movl $1,%ecx /* set zero flag */
71 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72 CFI_ENDPROC
73ENDPROC(copy_from_user)
74
75 .section .fixup,"ax"
76 /* must zero dest */
77bad_from_user:
78 CFI_STARTPROC
79 movl %edx,%ecx
80 xorl %eax,%eax
81 rep
82 stosb
83bad_to_user:
84 movl %edx,%eax
85 ret
86 CFI_ENDPROC
87END(bad_from_user)
88 .previous
89
90
91/*
92 * copy_user_generic_unrolled - memory copy with exception handling.
93 * This version is for CPUs like P4 that don't have efficient microcode for rep movsq
94 *
95 * Input:
96 * rdi destination
97 * rsi source
98 * rdx count
99 * ecx zero flag -- if true zero destination on error
100 *
101 * Output:
102 * eax uncopied bytes or 0 if successful.
103 */
104ENTRY(copy_user_generic_unrolled)
105 CFI_STARTPROC
106 pushq %rbx
107 CFI_ADJUST_CFA_OFFSET 8
108 CFI_REL_OFFSET rbx, 0
109 pushq %rcx
110 CFI_ADJUST_CFA_OFFSET 8
111 CFI_REL_OFFSET rcx, 0
112	xorl %eax,%eax		/* zero for the exception handler */
113
114#ifdef FIX_ALIGNMENT
115 /* check for bad alignment of destination */
116 movl %edi,%ecx
117 andl $7,%ecx
118 jnz .Lbad_alignment
119.Lafter_bad_alignment:
120#endif
121
122 movq %rdx,%rcx
123
124 movl $64,%ebx
125 shrq $6,%rdx
126 decq %rdx
127 js .Lhandle_tail
128
129 .p2align 4
130.Lloop:
131.Ls1: movq (%rsi),%r11
132.Ls2: movq 1*8(%rsi),%r8
133.Ls3: movq 2*8(%rsi),%r9
134.Ls4: movq 3*8(%rsi),%r10
135.Ld1: movq %r11,(%rdi)
136.Ld2: movq %r8,1*8(%rdi)
137.Ld3: movq %r9,2*8(%rdi)
138.Ld4: movq %r10,3*8(%rdi)
139
140.Ls5: movq 4*8(%rsi),%r11
141.Ls6: movq 5*8(%rsi),%r8
142.Ls7: movq 6*8(%rsi),%r9
143.Ls8: movq 7*8(%rsi),%r10
144.Ld5: movq %r11,4*8(%rdi)
145.Ld6: movq %r8,5*8(%rdi)
146.Ld7: movq %r9,6*8(%rdi)
147.Ld8: movq %r10,7*8(%rdi)
148
149 decq %rdx
150
151 leaq 64(%rsi),%rsi
152 leaq 64(%rdi),%rdi
153
154 jns .Lloop
155
156 .p2align 4
157.Lhandle_tail:
158 movl %ecx,%edx
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lhandle_7
162 movl $8,%ebx
163 .p2align 4
164.Lloop_8:
165.Ls9: movq (%rsi),%r8
166.Ld9: movq %r8,(%rdi)
167 decl %ecx
168 leaq 8(%rdi),%rdi
169 leaq 8(%rsi),%rsi
170 jnz .Lloop_8
171
172.Lhandle_7:
173 movl %edx,%ecx
174 andl $7,%ecx
175 jz .Lende
176 .p2align 4
177.Lloop_1:
178.Ls10: movb (%rsi),%bl
179.Ld10: movb %bl,(%rdi)
180 incq %rdi
181 incq %rsi
182 decl %ecx
183 jnz .Lloop_1
184
185 CFI_REMEMBER_STATE
186.Lende:
187 popq %rcx
188 CFI_ADJUST_CFA_OFFSET -8
189 CFI_RESTORE rcx
190 popq %rbx
191 CFI_ADJUST_CFA_OFFSET -8
192 CFI_RESTORE rbx
193 ret
194 CFI_RESTORE_STATE
195
196#ifdef FIX_ALIGNMENT
197 /* align destination */
198 .p2align 4
199.Lbad_alignment:
200 movl $8,%r9d
201 subl %ecx,%r9d
202 movl %r9d,%ecx
203 cmpq %r9,%rdx
204 jz .Lhandle_7
205 js .Lhandle_7
206.Lalign_1:
207.Ls11: movb (%rsi),%bl
208.Ld11: movb %bl,(%rdi)
209 incq %rsi
210 incq %rdi
211 decl %ecx
212 jnz .Lalign_1
213 subq %r9,%rdx
214 jmp .Lafter_bad_alignment
215#endif
216
217 /* table sorted by exception address */
218 .section __ex_table,"a"
219 .align 8
220 .quad .Ls1,.Ls1e
221 .quad .Ls2,.Ls2e
222 .quad .Ls3,.Ls3e
223 .quad .Ls4,.Ls4e
224 .quad .Ld1,.Ls1e
225 .quad .Ld2,.Ls2e
226 .quad .Ld3,.Ls3e
227 .quad .Ld4,.Ls4e
228 .quad .Ls5,.Ls5e
229 .quad .Ls6,.Ls6e
230 .quad .Ls7,.Ls7e
231 .quad .Ls8,.Ls8e
232 .quad .Ld5,.Ls5e
233 .quad .Ld6,.Ls6e
234 .quad .Ld7,.Ls7e
235 .quad .Ld8,.Ls8e
236 .quad .Ls9,.Le_quad
237 .quad .Ld9,.Le_quad
238 .quad .Ls10,.Le_byte
239 .quad .Ld10,.Le_byte
240#ifdef FIX_ALIGNMENT
241 .quad .Ls11,.Lzero_rest
242 .quad .Ld11,.Lzero_rest
243#endif
244 .quad .Le5,.Le_zero
245 .previous
246
247 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
248 pessimistic side. this is gross. it would be better to fix the
249 interface. */
250 /* eax: zero, ebx: 64 */
251.Ls1e: addl $8,%eax
252.Ls2e: addl $8,%eax
253.Ls3e: addl $8,%eax
254.Ls4e: addl $8,%eax
255.Ls5e: addl $8,%eax
256.Ls6e: addl $8,%eax
257.Ls7e: addl $8,%eax
258.Ls8e: addl $8,%eax
259 addq %rbx,%rdi /* +64 */
260 subq %rax,%rdi /* correct destination with computed offset */
261
262 shlq $6,%rdx /* loop counter * 64 (stride length) */
263 addq %rax,%rdx /* add offset to loopcnt */
264 andl $63,%ecx /* remaining bytes */
265 addq %rcx,%rdx /* add them */
266 jmp .Lzero_rest
267
268 /* exception on quad word loop in tail handling */
269 /* ecx: loopcnt/8, %edx: length, rdi: correct */
270.Le_quad:
271 shll $3,%ecx
272 andl $7,%edx
273 addl %ecx,%edx
274 /* edx: bytes to zero, rdi: dest, eax:zero */
275.Lzero_rest:
276 cmpl $0,(%rsp)
277 jz .Le_zero
278 movq %rdx,%rcx
279.Le_byte:
280 xorl %eax,%eax
281.Le5: rep
282 stosb
283 /* when there is another exception while zeroing the rest just return */
284.Le_zero:
285 movq %rdx,%rax
286 jmp .Lende
287 CFI_ENDPROC
288ENDPROC(copy_user_generic_unrolled)
289
290
291 /* Some CPUs run faster using the string copy instructions.
292 This is also a lot simpler. Use them when possible.
293 Patch in jmps to this code instead of copying it fully
294 to avoid unwanted aliasing in the exception tables. */
295
296 /* rdi destination
297 * rsi source
298 * rdx count
299 * ecx zero flag
300 *
301 * Output:
302 * eax uncopied bytes or 0 if successful.
303 *
304 * Only 4GB of copy is supported. This shouldn't be a problem
305 * because the kernel normally only writes from/to page sized chunks
306 * even if user space passed a longer buffer.
307 * Copying more would also be dangerous because both Intel and AMD have
308 * errata with rep movsq > 4GB. If someone feels the need to fix this,
309 * please keep those errata in mind.
310 */
311ENTRY(copy_user_generic_string)
312 CFI_STARTPROC
313 movl %ecx,%r8d /* save zero flag */
314 movl %edx,%ecx
315 shrl $3,%ecx
316 andl $7,%edx
317 jz 10f
3181: rep
319 movsq
320 movl %edx,%ecx
3212: rep
322 movsb
3239: movl %ecx,%eax
324 ret
325
326 /* multiple of 8 byte */
32710: rep
328 movsq
329 xor %eax,%eax
330 ret
331
332 /* exception handling */
3333: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
334 jmp 6f
3355: movl %ecx,%eax /* exception on byte loop */
336 /* eax: left over bytes */
3376: testl %r8d,%r8d /* zero flag set? */
338 jz 7f
339 movl %eax,%ecx /* initialize x86 loop counter */
340 push %rax
341 xorl %eax,%eax
3428: rep
343 stosb /* zero the rest */
34411: pop %rax
3457: ret
346 CFI_ENDPROC
347END(copy_user_generic_string)
348
349 .section __ex_table,"a"
350 .quad 1b,3b
351 .quad 2b,5b
352 .quad 8b,11b
353 .quad 10b,3b
354 .previous
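A usage sketch of the calling convention these routines implement (the return value is the count of bytes left uncopied; the buffer names here are hypothetical):

	char kbuf[64];

	/* copy_from_user() returns 0 on success, or the number of bytes that
	 * could not be copied; on a fault the fixup above zero-fills the
	 * remainder of the kernel buffer before returning. */
	if (copy_from_user(kbuf, user_ptr, sizeof(kbuf)))
		return -EFAULT;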
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * copy_user_nocache - Uncached memory copy with exception handling
19 * This will force destination/source out of cache for more performance.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count
25 * rcx zero flag: when 1, zero the destination on an exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler */
40
41#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */
43 movl %edi,%ecx
44 andl $7,%ecx
45 jnz .Lbad_alignment
46.Lafter_bad_alignment:
47#endif
48
49 movq %rdx,%rcx
50
51 movl $64,%ebx
52 shrq $6,%rdx
53 decq %rdx
54 js .Lhandle_tail
55
56 .p2align 4
57.Lloop:
58.Ls1: movq (%rsi),%r11
59.Ls2: movq 1*8(%rsi),%r8
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi)
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75
76 dec %rdx
77
78 leaq 64(%rsi),%rsi
79 leaq 64(%rdi),%rdi
80
81 jns .Lloop
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx
86 andl $63,%ecx
87 shrl $3,%ecx
88 jz .Lhandle_7
89 movl $8,%ebx
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi)
94 decl %ecx
95 leaq 8(%rdi),%rdi
96 leaq 8(%rsi),%rsi
97 jnz .Lloop_8
98
99.Lhandle_7:
100 movl %edx,%ecx
101 andl $7,%ecx
102 jz .Lende
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi
109 decl %ecx
110 jnz .Lloop_1
111
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 ret
121 CFI_RESTORE_STATE
122
123#ifdef FIX_ALIGNMENT
124 /* align destination */
125 .p2align 4
126.Lbad_alignment:
127 movl $8,%r9d
128 subl %ecx,%r9d
129 movl %r9d,%ecx
130 cmpq %r9,%rdx
131 jz .Lhandle_7
132 js .Lhandle_7
133.Lalign_1:
134.Ls11: movb (%rsi),%bl
135.Ld11: movb %bl,(%rdi)
136 incq %rsi
137 incq %rdi
138 decl %ecx
139 jnz .Lalign_1
140 subq %r9,%rdx
141 jmp .Lafter_bad_alignment
142#endif
143
144 /* table sorted by exception address */
145 .section __ex_table,"a"
146 .align 8
147 .quad .Ls1,.Ls1e
148 .quad .Ls2,.Ls2e
149 .quad .Ls3,.Ls3e
150 .quad .Ls4,.Ls4e
151 .quad .Ld1,.Ls1e
152 .quad .Ld2,.Ls2e
153 .quad .Ld3,.Ls3e
154 .quad .Ld4,.Ls4e
155 .quad .Ls5,.Ls5e
156 .quad .Ls6,.Ls6e
157 .quad .Ls7,.Ls7e
158 .quad .Ls8,.Ls8e
159 .quad .Ld5,.Ls5e
160 .quad .Ld6,.Ls6e
161 .quad .Ld7,.Ls7e
162 .quad .Ld8,.Ls8e
163 .quad .Ls9,.Le_quad
164 .quad .Ld9,.Le_quad
165 .quad .Ls10,.Le_byte
166 .quad .Ld10,.Le_byte
167#ifdef FIX_ALIGNMENT
168 .quad .Ls11,.Lzero_rest
169 .quad .Ld11,.Lzero_rest
170#endif
171 .quad .Le5,.Le_zero
172 .previous
173
174 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
175 pessimistic side. this is gross. it would be better to fix the
176 interface. */
177 /* eax: zero, ebx: 64 */
178.Ls1e: addl $8,%eax
179.Ls2e: addl $8,%eax
180.Ls3e: addl $8,%eax
181.Ls4e: addl $8,%eax
182.Ls5e: addl $8,%eax
183.Ls6e: addl $8,%eax
184.Ls7e: addl $8,%eax
185.Ls8e: addl $8,%eax
186 addq %rbx,%rdi /* +64 */
187 subq %rax,%rdi /* correct destination with computed offset */
188
189 shlq $6,%rdx /* loop counter * 64 (stride length) */
190 addq %rax,%rdx /* add offset to loopcnt */
191 andl $63,%ecx /* remaining bytes */
192 addq %rcx,%rdx /* add them */
193 jmp .Lzero_rest
194
195 /* exception on quad word loop in tail handling */
196 /* ecx: loopcnt/8, %edx: length, rdi: correct */
197.Le_quad:
198 shll $3,%ecx
199 andl $7,%edx
200 addl %ecx,%edx
201 /* edx: bytes to zero, rdi: dest, eax:zero */
202.Lzero_rest:
203 cmpl $0,(%rsp) /* zero flag set? */
204 jz .Le_zero
205 movq %rdx,%rcx
206.Le_byte:
207 xorl %eax,%eax
208.Le5: rep
209 stosb
210 /* when there is another exception while zeroing the rest just return */
211.Le_zero:
212 movq %rdx,%rax
213 jmp .Lende
214 CFI_ENDPROC
215ENDPROC(__copy_user_nocache)
216
217
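A hedged caller sketch; the C prototype is inferred from the register interface documented above (rdi/rsi/rdx plus the rcx zero flag), so treat it as an assumption:

	/* dst will not be read back soon (e.g. a buffer about to be handed to
	 * a device), so the non-temporal movnti stores avoid polluting the
	 * cache with data we will not touch again. */
	long left = __copy_user_nocache(dst, user_src, len, 1 /* zero on fault */);
	if (left)
		return -EFAULT;		/* 'left' bytes were not copied */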
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 000000000000..f0dba36578ea
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all.
7 */
8#include <linux/linkage.h>
9#include <asm/dwarf2.h>
10#include <asm/errno.h>
11
12/*
13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed.
16 *
17 * Input
18 * rdi source
19 * rsi destination
20 * edx len (32bit)
21 * ecx sum (32bit)
22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int)
24 *
25 * Output
26 * eax 32bit unfolded sum. Undefined in case of an exception.
27 *
28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes.
30 */
31
32 .macro source
3310:
34 .section __ex_table,"a"
35 .align 8
36 .quad 10b,.Lbad_source
37 .previous
38 .endm
39
40 .macro dest
4120:
42 .section __ex_table,"a"
43 .align 8
44 .quad 20b,.Lbad_dest
45 .previous
46 .endm
47
48 .macro ignore L=.Lignore
4930:
50 .section __ex_table,"a"
51 .align 8
52 .quad 30b,\L
53 .previous
54 .endm
55
56
57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC
59 cmpl $3*64,%edx
60 jle .Lignore
61
62.Lignore:
63 subq $7*8,%rsp
64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8
75
76 movq %r8,(%rsp)
77 movq %r9,1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81
82 xorl %r9d,%r9d
83 movq %rcx,%r12
84
85 shrq $6,%r12
86 jz .Lhandle_tail /* < 64 */
87
88 clc
89
90	/* main loop. checksum and copy in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */
93 /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
94 .p2align 4
95.Lloop:
96 source
97 movq (%rdi),%rbx
98 source
99 movq 8(%rdi),%r8
100 source
101 movq 16(%rdi),%r11
102 source
103 movq 24(%rdi),%rdx
104
105 source
106 movq 32(%rdi),%r10
107 source
108 movq 40(%rdi),%rbp
109 source
110 movq 48(%rdi),%r14
111 source
112 movq 56(%rdi),%r13
113
114 ignore 2f
115 prefetcht0 5*64(%rdi)
1162:
117 adcq %rbx,%rax
118 adcq %r8,%rax
119 adcq %r11,%rax
120 adcq %rdx,%rax
121 adcq %r10,%rax
122 adcq %rbp,%rax
123 adcq %r14,%rax
124 adcq %r13,%rax
125
126 decl %r12d
127
128 dest
129 movq %rbx,(%rsi)
130 dest
131 movq %r8,8(%rsi)
132 dest
133 movq %r11,16(%rsi)
134 dest
135 movq %rdx,24(%rsi)
136
137 dest
138 movq %r10,32(%rsi)
139 dest
140 movq %rbp,40(%rsi)
141 dest
142 movq %r14,48(%rsi)
143 dest
144 movq %r13,56(%rsi)
145
1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150
151 jnz .Lloop
152
153 adcq %r9,%rax
154
155	/* do last up to 56 bytes */
156.Lhandle_tail:
157 /* ecx: count */
158 movl %ecx,%r10d
159 andl $63,%ecx
160 shrl $3,%ecx
161 jz .Lfold
162 clc
163 .p2align 4
164.Lloop_8:
165 source
166 movq (%rdi),%rbx
167 adcq %rbx,%rax
168 decl %ecx
169 dest
170 movq %rbx,(%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */
172 leaq 8(%rdi),%rdi
173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */
175
176.Lfold:
177 /* reduce checksum to 32bits */
178 movl %eax,%ebx
179 shrq $32,%rax
180 addl %ebx,%eax
181 adcl %r9d,%eax
182
183	/* do last up to 6 bytes */
184.Lhandle_7:
185 movl %r10d,%ecx
186 andl $7,%ecx
187 shrl $1,%ecx
188 jz .Lhandle_1
189 movl $2,%edx
190 xorl %ebx,%ebx
191 clc
192 .p2align 4
193.Lloop_1:
194 source
195 movw (%rdi),%bx
196 adcl %ebx,%eax
197 decl %ecx
198 dest
199 movw %bx,(%rsi)
200 leaq 2(%rdi),%rdi
201 leaq 2(%rsi),%rsi
202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */
204
205 /* handle last odd byte */
206.Lhandle_1:
207 testl $1,%r10d
208 jz .Lende
209 xorl %ebx,%ebx
210 source
211 movb (%rdi),%bl
212 dest
213 movb %bl,(%rsi)
214 addl %ebx,%eax
215 adcl %r9d,%eax /* carry */
216
217 CFI_REMEMBER_STATE
218.Lende:
219 movq 2*8(%rsp),%rbx
220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12
222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14
224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13
226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp
228 CFI_RESTORE rbp
229 addq $7*8,%rsp
230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret
232 CFI_RESTORE_STATE
233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source:
236 movq (%rsp),%rax
237 testq %rax,%rax
238 jz .Lende
239 movl $-EFAULT,(%rax)
240 jmp .Lende
241
242.Lbad_dest:
243 movq 8(%rsp),%rax
244 testq %rax,%rax
245 jz .Lende
246 movl $-EFAULT,(%rax)
247 jmp .Lende
248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic)
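The .Lfold block above reduces the running 64-bit sum to the 32-bit unfolded form the C wrappers expect; an equivalent sketch in C:

	/* illustrative equivalent of .Lfold: add the high and low halves and
	 * fold the resulting carry back in (ones-complement addition). */
	static inline unsigned int fold_to_32(unsigned long sum)
	{
		unsigned int lo = (unsigned int)sum;
		unsigned int hi = (unsigned int)(sum >> 32);

		lo += hi;
		if (lo < hi)		/* carry out of the 32-bit add */
			lo++;
		return lo;
	}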
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
new file mode 100644
index 000000000000..bc503f506903
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
1/*
2 * arch/x86_64/lib/csum-partial.c
3 *
4 * This file contains network checksum routines that are better done
5 * in an architecture-specific manner due to speed.
6 */
7
8#include <linux/compiler.h>
9#include <linux/module.h>
10#include <asm/checksum.h>
11
12static inline unsigned short from32to16(unsigned a)
13{
14 unsigned short b = a >> 16;
15 asm("addw %w2,%w0\n\t"
16 "adcw $0,%w0\n"
17 : "=r" (b)
18 : "0" (b), "r" (a));
19 return b;
20}
21
22/*
23 * Do a 64-bit checksum on an arbitrary memory area.
24 * Returns a 32bit checksum.
25 *
26 * This isn't as time critical as it used to be because many NICs
27 * do hardware checksumming these days.
28 *
29 * Things tried and found to not make it faster:
30 * Manual Prefetching
31 *   Unrolling to a 128 byte inner loop.
32 * Using interleaving with more registers to break the carry chains.
33 */
34static unsigned do_csum(const unsigned char *buff, unsigned len)
35{
36 unsigned odd, count;
37 unsigned long result = 0;
38
39 if (unlikely(len == 0))
40 return result;
41 odd = 1 & (unsigned long) buff;
42 if (unlikely(odd)) {
43 result = *buff << 8;
44 len--;
45 buff++;
46 }
47 count = len >> 1; /* nr of 16-bit words.. */
48 if (count) {
49 if (2 & (unsigned long) buff) {
50 result += *(unsigned short *)buff;
51 count--;
52 len -= 2;
53 buff += 2;
54 }
55 count >>= 1; /* nr of 32-bit words.. */
56 if (count) {
57 unsigned long zero;
58 unsigned count64;
59 if (4 & (unsigned long) buff) {
60 result += *(unsigned int *) buff;
61 count--;
62 len -= 4;
63 buff += 4;
64 }
65 count >>= 1; /* nr of 64-bit words.. */
66
67 /* main loop using 64byte blocks */
68 zero = 0;
69 count64 = count >> 3;
70 while (count64) {
71 asm("addq 0*8(%[src]),%[res]\n\t"
72 "adcq 1*8(%[src]),%[res]\n\t"
73 "adcq 2*8(%[src]),%[res]\n\t"
74 "adcq 3*8(%[src]),%[res]\n\t"
75 "adcq 4*8(%[src]),%[res]\n\t"
76 "adcq 5*8(%[src]),%[res]\n\t"
77 "adcq 6*8(%[src]),%[res]\n\t"
78 "adcq 7*8(%[src]),%[res]\n\t"
79 "adcq %[zero],%[res]"
80 : [res] "=r" (result)
81 : [src] "r" (buff), [zero] "r" (zero),
82 "[res]" (result));
83 buff += 64;
84 count64--;
85 }
86
87			/* last up to 7 8-byte blocks */
88 count %= 8;
89 while (count) {
90 asm("addq %1,%0\n\t"
91 "adcq %2,%0\n"
92 : "=r" (result)
93 : "m" (*(unsigned long *)buff),
94 "r" (zero), "0" (result));
95 --count;
96 buff += 8;
97 }
98 result = add32_with_carry(result>>32,
99 result&0xffffffff);
100
101 if (len & 4) {
102 result += *(unsigned int *) buff;
103 buff += 4;
104 }
105 }
106 if (len & 2) {
107 result += *(unsigned short *) buff;
108 buff += 2;
109 }
110 }
111 if (len & 1)
112 result += *buff;
113 result = add32_with_carry(result>>32, result & 0xffffffff);
114 if (unlikely(odd)) {
115 result = from32to16(result);
116 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
117 }
118 return result;
119}
120
121/*
122 * computes the checksum of a memory block at buff, length len,
123 * and adds in "sum" (32-bit)
124 *
125 * returns a 32-bit number suitable for feeding into itself
126 * or csum_tcpudp_magic
127 *
128 * this function must be called with even lengths, except
129 * for the last fragment, which may be odd
130 *
131 * it's best to have buff aligned on a 64-bit boundary
132 */
133__wsum csum_partial(const void *buff, int len, __wsum sum)
134{
135 return (__force __wsum)add32_with_carry(do_csum(buff, len),
136 (__force u32)sum);
137}
138
139EXPORT_SYMBOL(csum_partial);
140
141/*
142 * this routine is used for miscellaneous IP-like checksums, mainly
143 * in icmp.c
144 */
145__sum16 ip_compute_csum(const void *buff, int len)
146{
147 return csum_fold(csum_partial(buff,len,0));
148}
149EXPORT_SYMBOL(ip_compute_csum);
150
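A short usage sketch (fragment names and lengths are hypothetical); per the comment above, intermediate fragments should have even lengths:

	__wsum csum = 0;
	__sum16 check;

	csum = csum_partial(frag1, len1, csum);	/* sums can be chained...       */
	csum = csum_partial(frag2, len2, csum);	/* ...fragment by fragment      */
	check = csum_fold(csum);		/* final folded 16-bit checksum */

	/* or, for a one-shot IP-style header checksum: */
	check = ip_compute_csum(hdr, hdr_len);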
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
new file mode 100644
index 000000000000..fd42a4a095fc
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,135 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v.2
3 *
4 * Wrappers of assembly checksum functions for x86-64.
5 */
6
7#include <asm/checksum.h>
8#include <linux/module.h>
9
10/**
11 * csum_partial_copy_from_user - Copy and checksum from user space.
12 * @src: source address (user space)
13 * @dst: destination address
14 * @len: number of bytes to be copied.
15 * @isum: initial sum that is added into the result (32bit unfolded)
16 * @errp: set to -EFAULT for a bad source address.
17 *
18 * Returns a 32bit unfolded checksum of the buffer.
19 * src and dst are best aligned to 64bits.
20 */
21__wsum
22csum_partial_copy_from_user(const void __user *src, void *dst,
23 int len, __wsum isum, int *errp)
24{
25 might_sleep();
26 *errp = 0;
27 if (likely(access_ok(VERIFY_READ,src, len))) {
28		/* Why 6, not 7? To handle odd addresses we would need to do
29		   considerable complications to fix the checksum, which is
30		   defined as a 16bit accumulator. The alignment fixup code
31		   is primarily here for performance compatibility with the
32		   32bit version, and that path handles odd addresses
33		   slowly too. */
34 if (unlikely((unsigned long)src & 6)) {
35 while (((unsigned long)src & 6) && len >= 2) {
36 __u16 val16;
37 *errp = __get_user(val16, (const __u16 __user *)src);
38 if (*errp)
39 return isum;
40 *(__u16 *)dst = val16;
41 isum = (__force __wsum)add32_with_carry(
42 (__force unsigned)isum, val16);
43 src += 2;
44 dst += 2;
45 len -= 2;
46 }
47 }
48 isum = csum_partial_copy_generic((__force const void *)src,
49 dst, len, isum, errp, NULL);
50 if (likely(*errp == 0))
51 return isum;
52 }
53 *errp = -EFAULT;
54 memset(dst,0,len);
55 return isum;
56}
57
58EXPORT_SYMBOL(csum_partial_copy_from_user);
59
60/**
61 * csum_partial_copy_to_user - Copy and checksum to user space.
62 * @src: source address
63 * @dst: destination address (user space)
64 * @len: number of bytes to be copied.
65 * @isum: initial sum that is added into the result (32bit unfolded)
66 * @errp: set to -EFAULT for a bad destination address.
67 *
68 * Returns a 32bit unfolded checksum of the buffer.
69 * src and dst are best aligned to 64bits.
70 */
71__wsum
72csum_partial_copy_to_user(const void *src, void __user *dst,
73 int len, __wsum isum, int *errp)
74{
75 might_sleep();
76 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
77 *errp = -EFAULT;
78 return 0;
79 }
80
81 if (unlikely((unsigned long)dst & 6)) {
82 while (((unsigned long)dst & 6) && len >= 2) {
83 __u16 val16 = *(__u16 *)src;
84 isum = (__force __wsum)add32_with_carry(
85 (__force unsigned)isum, val16);
86 *errp = __put_user(val16, (__u16 __user *)dst);
87 if (*errp)
88 return isum;
89 src += 2;
90 dst += 2;
91 len -= 2;
92 }
93 }
94
95 *errp = 0;
96 return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp);
97}
98
99EXPORT_SYMBOL(csum_partial_copy_to_user);
100
101/**
102 * csum_partial_copy_nocheck - Copy and checksum.
103 * @src: source address
104 * @dst: destination address
105 * @len: number of bytes to be copied.
106 * @isum: initial sum that is added into the result (32bit unfolded)
107 *
108 * Returns a 32bit unfolded checksum of the buffer.
109 */
110__wsum
111csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
112{
113 return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
114}
115EXPORT_SYMBOL(csum_partial_copy_nocheck);
116
117__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
118 const struct in6_addr *daddr,
119 __u32 len, unsigned short proto, __wsum sum)
120{
121 __u64 rest, sum64;
122
123 rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
124 (__force __u64)sum;
125 asm(" addq (%[saddr]),%[sum]\n"
126 " adcq 8(%[saddr]),%[sum]\n"
127 " adcq (%[daddr]),%[sum]\n"
128 " adcq 8(%[daddr]),%[sum]\n"
129 " adcq $0,%[sum]\n"
130 : [sum] "=r" (sum64)
131 : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
132 return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
133}
134
135EXPORT_SYMBOL(csum_ipv6_magic);
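A hedged caller sketch of the from-user wrapper (buffer names are illustrative):

	int err;
	__wsum csum;

	csum = csum_partial_copy_from_user(user_src, kbuf, len,
					   0 /* initial sum */, &err);
	if (err)	/* -EFAULT: kbuf has been zeroed, csum is unusable */
		return err;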
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
new file mode 100644
index 000000000000..f6edb11364df
--- /dev/null
+++ b/arch/x86/lib/delay_32.c
@@ -0,0 +1,103 @@
1/*
2 * Precise Delay Loops for i386
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors. The additional
9 * jump magic is needed to get the timing stable on all the CPU's
10 * we have to worry about.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17#include <asm/processor.h>
18#include <asm/delay.h>
19#include <asm/timer.h>
20
21#ifdef CONFIG_SMP
22# include <asm/smp.h>
23#endif
24
25/* simple loop based delay: */
26static void delay_loop(unsigned long loops)
27{
28 int d0;
29
30 __asm__ __volatile__(
31 "\tjmp 1f\n"
32 ".align 16\n"
33 "1:\tjmp 2f\n"
34 ".align 16\n"
35 "2:\tdecl %0\n\tjns 2b"
36 :"=&a" (d0)
37 :"0" (loops));
38}
39
40/* TSC based delay: */
41static void delay_tsc(unsigned long loops)
42{
43 unsigned long bclock, now;
44
45 rdtscl(bclock);
46 do {
47 rep_nop();
48 rdtscl(now);
49 } while ((now-bclock) < loops);
50}
51
52/*
53 * Since we calibrate only once at boot, this
54 * function should be set once at boot and not changed
55 */
56static void (*delay_fn)(unsigned long) = delay_loop;
57
58void use_tsc_delay(void)
59{
60 delay_fn = delay_tsc;
61}
62
63int read_current_timer(unsigned long *timer_val)
64{
65 if (delay_fn == delay_tsc) {
66 rdtscl(*timer_val);
67 return 0;
68 }
69 return -1;
70}
71
72void __delay(unsigned long loops)
73{
74 delay_fn(loops);
75}
76
77inline void __const_udelay(unsigned long xloops)
78{
79 int d0;
80
81 xloops *= 4;
82 __asm__("mull %0"
83 :"=d" (xloops), "=&a" (d0)
84 :"1" (xloops), "0"
85 (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
86
87 __delay(++xloops);
88}
89
90void __udelay(unsigned long usecs)
91{
92 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
93}
94
95void __ndelay(unsigned long nsecs)
96{
97 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
98}
99
100EXPORT_SYMBOL(__delay);
101EXPORT_SYMBOL(__const_udelay);
102EXPORT_SYMBOL(__udelay);
103EXPORT_SYMBOL(__ndelay);
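A rough check of the arithmetic behind __const_udelay()/__udelay() above, as a comment-only sketch:

	/*
	 * 0x10c7 = 4295 ~= 2^32 / 10^6, rounded up.  __const_udelay()
	 * multiplies by loops_per_jiffy * HZ (split as *4 and HZ/4 to keep
	 * the factors in 32 bits) and keeps only the high 32 bits of the
	 * 32x32->64 product, i.e. an implicit >> 32 that cancels the 2^32:
	 *
	 *   loops ~= usecs * (2^32/10^6) * loops_per_jiffy * HZ / 2^32
	 *         ~= usecs * loops_per_second / 10^6
	 *
	 * The same scheme with 0x5 ~= 2^32 / 10^9 gives __ndelay().
	 */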
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
new file mode 100644
index 000000000000..2dbebd308347
--- /dev/null
+++ b/arch/x86/lib/delay_64.c
@@ -0,0 +1,57 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/delay.h>
14#include <asm/delay.h>
15#include <asm/msr.h>
16
17#ifdef CONFIG_SMP
18#include <asm/smp.h>
19#endif
20
21int read_current_timer(unsigned long *timer_value)
22{
23 rdtscll(*timer_value);
24 return 0;
25}
26
27void __delay(unsigned long loops)
28{
29 unsigned bclock, now;
30
31 rdtscl(bclock);
32 do
33 {
34 rep_nop();
35 rdtscl(now);
36 }
37 while((now-bclock) < loops);
38}
39EXPORT_SYMBOL(__delay);
40
41inline void __const_udelay(unsigned long xloops)
42{
43 __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
44}
45EXPORT_SYMBOL(__const_udelay);
46
47void __udelay(unsigned long usecs)
48{
49 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
50}
51EXPORT_SYMBOL(__udelay);
52
53void __ndelay(unsigned long nsecs)
54{
55 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
56}
57EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
new file mode 100644
index 000000000000..6d84b53f12a2
--- /dev/null
+++ b/arch/x86/lib/getuser_32.S
@@ -0,0 +1,78 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __get_user_X
18 *
19 * Inputs: %eax contains the address
20 *
21 * Outputs: %eax is error code (0 or -EFAULT)
22 * %edx contains zero-extended value
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28.text
29ENTRY(__get_user_1)
30 CFI_STARTPROC
31 GET_THREAD_INFO(%edx)
32 cmpl TI_addr_limit(%edx),%eax
33 jae bad_get_user
341: movzbl (%eax),%edx
35 xorl %eax,%eax
36 ret
37 CFI_ENDPROC
38ENDPROC(__get_user_1)
39
40ENTRY(__get_user_2)
41 CFI_STARTPROC
42 addl $1,%eax
43 jc bad_get_user
44 GET_THREAD_INFO(%edx)
45 cmpl TI_addr_limit(%edx),%eax
46 jae bad_get_user
472: movzwl -1(%eax),%edx
48 xorl %eax,%eax
49 ret
50 CFI_ENDPROC
51ENDPROC(__get_user_2)
52
53ENTRY(__get_user_4)
54 CFI_STARTPROC
55 addl $3,%eax
56 jc bad_get_user
57 GET_THREAD_INFO(%edx)
58 cmpl TI_addr_limit(%edx),%eax
59 jae bad_get_user
603: movl -3(%eax),%edx
61 xorl %eax,%eax
62 ret
63 CFI_ENDPROC
64ENDPROC(__get_user_4)
65
66bad_get_user:
67 CFI_STARTPROC
68 xorl %edx,%edx
69 movl $-14,%eax
70 ret
71 CFI_ENDPROC
72END(bad_get_user)
73
74.section __ex_table,"a"
75 .long 1b,bad_get_user
76 .long 2b,bad_get_user
77 .long 3b,bad_get_user
78.previous
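These stubs back the get_user() macro; a minimal C-level usage sketch (variable names hypothetical):

	int val;

	/* For an int this expands to a call to __get_user_4: the user address
	 * goes in %eax, the value comes back zero-extended in %edx and the
	 * error code (0 or -EFAULT) in %eax. */
	if (get_user(val, (int __user *)uaddr))
		return -EFAULT;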
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
new file mode 100644
index 000000000000..5448876261f8
--- /dev/null
+++ b/arch/x86/lib/getuser_64.S
@@ -0,0 +1,109 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __get_user_X
15 *
16 * Inputs: %rcx contains the address.
17 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it.
19 *
20 * Outputs: %rax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value
22 *
23 * %r8 is destroyed.
24 *
25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly.
27 */
28
29#include <linux/linkage.h>
30#include <asm/dwarf2.h>
31#include <asm/page.h>
32#include <asm/errno.h>
33#include <asm/asm-offsets.h>
34#include <asm/thread_info.h>
35
36 .text
37ENTRY(__get_user_1)
38 CFI_STARTPROC
39 GET_THREAD_INFO(%r8)
40 cmpq threadinfo_addr_limit(%r8),%rcx
41 jae bad_get_user
421: movzb (%rcx),%edx
43 xorl %eax,%eax
44 ret
45 CFI_ENDPROC
46ENDPROC(__get_user_1)
47
48ENTRY(__get_user_2)
49 CFI_STARTPROC
50 GET_THREAD_INFO(%r8)
51 addq $1,%rcx
52 jc 20f
53 cmpq threadinfo_addr_limit(%r8),%rcx
54 jae 20f
55 decq %rcx
562: movzwl (%rcx),%edx
57 xorl %eax,%eax
58 ret
5920: decq %rcx
60 jmp bad_get_user
61 CFI_ENDPROC
62ENDPROC(__get_user_2)
63
64ENTRY(__get_user_4)
65 CFI_STARTPROC
66 GET_THREAD_INFO(%r8)
67 addq $3,%rcx
68 jc 30f
69 cmpq threadinfo_addr_limit(%r8),%rcx
70 jae 30f
71 subq $3,%rcx
723: movl (%rcx),%edx
73 xorl %eax,%eax
74 ret
7530: subq $3,%rcx
76 jmp bad_get_user
77 CFI_ENDPROC
78ENDPROC(__get_user_4)
79
80ENTRY(__get_user_8)
81 CFI_STARTPROC
82 GET_THREAD_INFO(%r8)
83 addq $7,%rcx
84 jc 40f
85 cmpq threadinfo_addr_limit(%r8),%rcx
86 jae 40f
87 subq $7,%rcx
884: movq (%rcx),%rdx
89 xorl %eax,%eax
90 ret
9140: subq $7,%rcx
92 jmp bad_get_user
93 CFI_ENDPROC
94ENDPROC(__get_user_8)
95
96bad_get_user:
97 CFI_STARTPROC
98 xorl %edx,%edx
99 movq $(-EFAULT),%rax
100 ret
101 CFI_ENDPROC
102END(bad_get_user)
103
104.section __ex_table,"a"
105 .quad 1b,bad_get_user
106 .quad 2b,bad_get_user
107 .quad 3b,bad_get_user
108 .quad 4b,bad_get_user
109.previous
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86/lib/io_64.c
@@ -0,0 +1,23 @@
1#include <linux/string.h>
2#include <asm/io.h>
3#include <linux/module.h>
4
5void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
6{
7 __inline_memcpy((void *) dst,src,len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
12{
13 __inline_memcpy(dst,(const void *) src,len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /* XXX: memset can mangle the IO patterns quite a bit.
20 perhaps it would be better to use a dumb one */
21 memset((void *)a,b,c);
22}
23EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
new file mode 100644
index 000000000000..05a95e713da8
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,30 @@
1/*
2 * Copyright 2006 PathScale, Inc. All Rights Reserved.
3 *
4 * This file is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
16 */
17
18#include <linux/linkage.h>
19#include <asm/dwarf2.h>
20
21/*
22 * override generic version in lib/iomap_copy.c
23 */
24ENTRY(__iowrite32_copy)
25 CFI_STARTPROC
26 movl %edx,%ecx
27 rep movsd
28 ret
29 CFI_ENDPROC
30ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 000000000000..8ac51b82a632
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,43 @@
1#include <linux/string.h>
2#include <linux/module.h>
3
4#undef memcpy
5#undef memset
6
7void *memcpy(void *to, const void *from, size_t n)
8{
9#ifdef CONFIG_X86_USE_3DNOW
10 return __memcpy3d(to, from, n);
11#else
12 return __memcpy(to, from, n);
13#endif
14}
15EXPORT_SYMBOL(memcpy);
16
17void *memset(void *s, int c, size_t count)
18{
19 return __memset(s, c, count);
20}
21EXPORT_SYMBOL(memset);
22
23void *memmove(void *dest, const void *src, size_t n)
24{
25 int d0, d1, d2;
26
27 if (dest < src) {
28 memcpy(dest,src,n);
29 } else {
30 __asm__ __volatile__(
31 "std\n\t"
32 "rep\n\t"
33 "movsb\n\t"
34 "cld"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2)
36 :"0" (n),
37 "1" (n-1+(const char *)src),
38 "2" (n-1+(char *)dest)
39 :"memory");
40 }
41 return dest;
42}
43EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..c22981fa2f3a
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,131 @@
1/* Copyright 2002 Andi Kleen */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6
7/*
8 * memcpy - Copy a memory block.
9 *
10 * Input:
11 * rdi destination
12 * rsi source
13 * rdx count
14 *
15 * Output:
16 * rax original destination
17 */
18
19 ALIGN
20memcpy_c:
21 CFI_STARTPROC
22 movq %rdi,%rax
23 movl %edx,%ecx
24 shrl $3,%ecx
25 andl $7,%edx
26 rep movsq
27 movl %edx,%ecx
28 rep movsb
29 ret
30 CFI_ENDPROC
31ENDPROC(memcpy_c)
32
33ENTRY(__memcpy)
34ENTRY(memcpy)
35 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40
41 movl %edx,%ecx
42 shrl $6,%ecx
43 jz .Lhandle_tail
44
45 .p2align 4
46.Lloop_64:
47 decl %ecx
48
49 movq (%rsi),%r11
50 movq 8(%rsi),%r8
51
52 movq %r11,(%rdi)
53 movq %r8,1*8(%rdi)
54
55 movq 2*8(%rsi),%r9
56 movq 3*8(%rsi),%r10
57
58 movq %r9,2*8(%rdi)
59 movq %r10,3*8(%rdi)
60
61 movq 4*8(%rsi),%r11
62 movq 5*8(%rsi),%r8
63
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64
76
77.Lhandle_tail:
78 movl %edx,%ecx
79 andl $63,%ecx
80 shrl $3,%ecx
81 jz .Lhandle_7
82 .p2align 4
83.Lloop_8:
84 decl %ecx
85 movq (%rsi),%r8
86 movq %r8,(%rdi)
87 leaq 8(%rdi),%rdi
88 leaq 8(%rsi),%rsi
89 jnz .Lloop_8
90
91.Lhandle_7:
92 movl %edx,%ecx
93 andl $7,%ecx
94 jz .Lende
95 .p2align 4
96.Lloop_1:
97 movb (%rsi),%r8b
98 movb %r8b,(%rdi)
99 incq %rdi
100 incq %rsi
101 decl %ecx
102 jnz .Lloop_1
103
104.Lende:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret
109.Lfinal:
110 CFI_ENDPROC
111ENDPROC(memcpy)
112ENDPROC(__memcpy)
113
114	/* Some CPUs run faster using the string copy instructions.
115	   They are also a lot simpler. Use them when possible */
116
117 .section .altinstr_replacement,"ax"
1181: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202:
121 .previous
122 .section .altinstructions,"a"
123 .align 8
124 .quad memcpy
125 .quad 1b
126 .byte X86_FEATURE_REP_GOOD
127	/* Replace only the beginning: memcpy is itself used to apply the
128	 * alternatives, so overwriting it with nops would only end in a reboot... */
129 .byte 2b - 1b
130 .byte 2b - 1b
131 .previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
new file mode 100644
index 000000000000..751ebae8ec42
--- /dev/null
+++ b/arch/x86/lib/memmove_64.c
@@ -0,0 +1,21 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void * dest,const void *src,size_t count)
10{
11 if (dest < src) {
12 return memcpy(dest,src,count);
13 } else {
14 char *p = (char *) dest + count;
15 char *s = (char *) src + count;
16 while (count--)
17 *--p = *--s;
18 }
19 return dest;
20}
21EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..2c5948116bd2
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,133 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5
6/*
7 * ISO C memset - set a memory block to a byte value.
8 *
9 * rdi destination
10 * rsi value (char)
11 * rdx count (bytes)
12 *
13 * rax original destination
14 */
15 ALIGN
16memset_c:
17 CFI_STARTPROC
18 movq %rdi,%r9
19 movl %edx,%r8d
20 andl $7,%r8d
21 movl %edx,%ecx
22 shrl $3,%ecx
23 /* expand byte value */
24 movzbl %sil,%esi
25 movabs $0x0101010101010101,%rax
26 mulq %rsi /* with rax, clobbers rdx */
27 rep stosq
28 movl %r8d,%ecx
29 rep stosb
30 movq %r9,%rax
31 ret
32 CFI_ENDPROC
33ENDPROC(memset_c)
34
35ENTRY(memset)
36ENTRY(__memset)
37 CFI_STARTPROC
38 movq %rdi,%r10
39 movq %rdx,%r11
40
41 /* expand byte value */
42 movzbl %sil,%ecx
43 movabs $0x0101010101010101,%rax
44 mul %rcx /* with rax, clobbers rdx */
45
46 /* align dst */
47 movl %edi,%r9d
48 andl $7,%r9d
49 jnz .Lbad_alignment
50 CFI_REMEMBER_STATE
51.Lafter_bad_alignment:
52
53 movl %r11d,%ecx
54 shrl $6,%ecx
55 jz .Lhandle_tail
56
57 .p2align 4
58.Lloop_64:
59 decl %ecx
60 movq %rax,(%rdi)
61 movq %rax,8(%rdi)
62 movq %rax,16(%rdi)
63 movq %rax,24(%rdi)
64 movq %rax,32(%rdi)
65 movq %rax,40(%rdi)
66 movq %rax,48(%rdi)
67 movq %rax,56(%rdi)
68 leaq 64(%rdi),%rdi
69 jnz .Lloop_64
70
71 /* Handle tail in loops. The loops should be faster than hard
72 to predict jump tables. */
73 .p2align 4
74.Lhandle_tail:
75 movl %r11d,%ecx
76 andl $63&(~7),%ecx
77 jz .Lhandle_7
78 shrl $3,%ecx
79 .p2align 4
80.Lloop_8:
81 decl %ecx
82 movq %rax,(%rdi)
83 leaq 8(%rdi),%rdi
84 jnz .Lloop_8
85
86.Lhandle_7:
87 movl %r11d,%ecx
88 andl $7,%ecx
89 jz .Lende
90 .p2align 4
91.Lloop_1:
92 decl %ecx
93 movb %al,(%rdi)
94 leaq 1(%rdi),%rdi
95 jnz .Lloop_1
96
97.Lende:
98 movq %r10,%rax
99 ret
100
101 CFI_RESTORE_STATE
102.Lbad_alignment:
103 cmpq $7,%r11
104 jbe .Lhandle_7
105 movq %rax,(%rdi) /* unaligned store */
106 movq $8,%r8
107 subq %r9,%r8
108 addq %r8,%rdi
109 subq %r8,%r11
110 jmp .Lafter_bad_alignment
111.Lfinal:
112 CFI_ENDPROC
113ENDPROC(memset)
114ENDPROC(__memset)
115
116	/* Some CPUs run faster using the string instructions.
117	   They are also a lot simpler. Use them when possible */
118
119#include <asm/cpufeature.h>
120
121 .section .altinstr_replacement,"ax"
1221: .byte 0xeb /* jmp <disp8> */
123 .byte (memset_c - memset) - (2f - 1b) /* offset */
1242:
125 .previous
126 .section .altinstructions,"a"
127 .align 8
128 .quad memset
129 .quad 1b
130 .byte X86_FEATURE_REP_GOOD
131 .byte .Lfinal - memset
132 .byte 2b - 1b
133 .previous
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 000000000000..28084d2e8dd4
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,403 @@
1#include <linux/types.h>
2#include <linux/string.h>
3#include <linux/sched.h>
4#include <linux/hardirq.h>
5#include <linux/module.h>
6
7#include <asm/i387.h>
8
9
10/*
11 * MMX 3DNow! library helper functions
12 *
13 * To do:
14 * We can use MMX just for prefetch in IRQ's. This may be a win.
15 * (reported so on K6-III)
16 *	We should use a better code-neutral filler for the short jump:
17 *	leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
18 * We also want to clobber the filler register so we don't get any
19 * register forwarding stalls on the filler.
20 *
21 * Add *user handling. Checksums are not a win with MMX on any CPU
22 *	tested so far for any MMX approach tried.
23 *
24 * 22/09/2000 - Arjan van de Ven
25 *		Improved for non-engineering-sample Athlons
26 *
27 */
28
29void *_mmx_memcpy(void *to, const void *from, size_t len)
30{
31 void *p;
32 int i;
33
34 if (unlikely(in_interrupt()))
35 return __memcpy(to, from, len);
36
37 p = to;
38 i = len >> 6; /* len/64 */
39
40 kernel_fpu_begin();
41
42 __asm__ __volatile__ (
43 "1: prefetch (%0)\n" /* This set is 28 bytes */
44 " prefetch 64(%0)\n"
45 " prefetch 128(%0)\n"
46 " prefetch 192(%0)\n"
47 " prefetch 256(%0)\n"
48 "2: \n"
49 ".section .fixup, \"ax\"\n"
50 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
51 " jmp 2b\n"
52 ".previous\n"
53 ".section __ex_table,\"a\"\n"
54 " .align 4\n"
55 " .long 1b, 3b\n"
56 ".previous"
57 : : "r" (from) );
58
59
60 for(; i>5; i--)
61 {
62 __asm__ __volatile__ (
63 "1: prefetch 320(%0)\n"
64 "2: movq (%0), %%mm0\n"
65 " movq 8(%0), %%mm1\n"
66 " movq 16(%0), %%mm2\n"
67 " movq 24(%0), %%mm3\n"
68 " movq %%mm0, (%1)\n"
69 " movq %%mm1, 8(%1)\n"
70 " movq %%mm2, 16(%1)\n"
71 " movq %%mm3, 24(%1)\n"
72 " movq 32(%0), %%mm0\n"
73 " movq 40(%0), %%mm1\n"
74 " movq 48(%0), %%mm2\n"
75 " movq 56(%0), %%mm3\n"
76 " movq %%mm0, 32(%1)\n"
77 " movq %%mm1, 40(%1)\n"
78 " movq %%mm2, 48(%1)\n"
79 " movq %%mm3, 56(%1)\n"
80 ".section .fixup, \"ax\"\n"
81 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
82 " jmp 2b\n"
83 ".previous\n"
84 ".section __ex_table,\"a\"\n"
85 " .align 4\n"
86 " .long 1b, 3b\n"
87 ".previous"
88 : : "r" (from), "r" (to) : "memory");
89 from+=64;
90 to+=64;
91 }
92
93 for(; i>0; i--)
94 {
95 __asm__ __volatile__ (
96 " movq (%0), %%mm0\n"
97 " movq 8(%0), %%mm1\n"
98 " movq 16(%0), %%mm2\n"
99 " movq 24(%0), %%mm3\n"
100 " movq %%mm0, (%1)\n"
101 " movq %%mm1, 8(%1)\n"
102 " movq %%mm2, 16(%1)\n"
103 " movq %%mm3, 24(%1)\n"
104 " movq 32(%0), %%mm0\n"
105 " movq 40(%0), %%mm1\n"
106 " movq 48(%0), %%mm2\n"
107 " movq 56(%0), %%mm3\n"
108 " movq %%mm0, 32(%1)\n"
109 " movq %%mm1, 40(%1)\n"
110 " movq %%mm2, 48(%1)\n"
111 " movq %%mm3, 56(%1)\n"
112 : : "r" (from), "r" (to) : "memory");
113 from+=64;
114 to+=64;
115 }
116 /*
117 * Now do the tail of the block
118 */
119 __memcpy(to, from, len&63);
120 kernel_fpu_end();
121 return p;
122}
123
124#ifdef CONFIG_MK7
125
126/*
127 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
128 * other MMX using processors do not.
129 */
130
131static void fast_clear_page(void *page)
132{
133 int i;
134
135 kernel_fpu_begin();
136
137 __asm__ __volatile__ (
138 " pxor %%mm0, %%mm0\n" : :
139 );
140
141 for(i=0;i<4096/64;i++)
142 {
143 __asm__ __volatile__ (
144 " movntq %%mm0, (%0)\n"
145 " movntq %%mm0, 8(%0)\n"
146 " movntq %%mm0, 16(%0)\n"
147 " movntq %%mm0, 24(%0)\n"
148 " movntq %%mm0, 32(%0)\n"
149 " movntq %%mm0, 40(%0)\n"
150 " movntq %%mm0, 48(%0)\n"
151 " movntq %%mm0, 56(%0)\n"
152 : : "r" (page) : "memory");
153 page+=64;
154 }
155 /* since movntq is weakly-ordered, a "sfence" is needed to become
156 * ordered again.
157 */
158 __asm__ __volatile__ (
159 " sfence \n" : :
160 );
161 kernel_fpu_end();
162}
163
164static void fast_copy_page(void *to, void *from)
165{
166 int i;
167
168 kernel_fpu_begin();
169
170 /* maybe the prefetch stuff can go before the expensive fnsave...
171 * but that is for later. -AV
172 */
173 __asm__ __volatile__ (
174 "1: prefetch (%0)\n"
175 " prefetch 64(%0)\n"
176 " prefetch 128(%0)\n"
177 " prefetch 192(%0)\n"
178 " prefetch 256(%0)\n"
179 "2: \n"
180 ".section .fixup, \"ax\"\n"
181 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
182 " jmp 2b\n"
183 ".previous\n"
184 ".section __ex_table,\"a\"\n"
185 " .align 4\n"
186 " .long 1b, 3b\n"
187 ".previous"
188 : : "r" (from) );
189
190 for(i=0; i<(4096-320)/64; i++)
191 {
192 __asm__ __volatile__ (
193 "1: prefetch 320(%0)\n"
194 "2: movq (%0), %%mm0\n"
195 " movntq %%mm0, (%1)\n"
196 " movq 8(%0), %%mm1\n"
197 " movntq %%mm1, 8(%1)\n"
198 " movq 16(%0), %%mm2\n"
199 " movntq %%mm2, 16(%1)\n"
200 " movq 24(%0), %%mm3\n"
201 " movntq %%mm3, 24(%1)\n"
202 " movq 32(%0), %%mm4\n"
203 " movntq %%mm4, 32(%1)\n"
204 " movq 40(%0), %%mm5\n"
205 " movntq %%mm5, 40(%1)\n"
206 " movq 48(%0), %%mm6\n"
207 " movntq %%mm6, 48(%1)\n"
208 " movq 56(%0), %%mm7\n"
209 " movntq %%mm7, 56(%1)\n"
210 ".section .fixup, \"ax\"\n"
211 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
212 " jmp 2b\n"
213 ".previous\n"
214 ".section __ex_table,\"a\"\n"
215 " .align 4\n"
216 " .long 1b, 3b\n"
217 ".previous"
218 : : "r" (from), "r" (to) : "memory");
219 from+=64;
220 to+=64;
221 }
222 for(i=(4096-320)/64; i<4096/64; i++)
223 {
224 __asm__ __volatile__ (
225 "2: movq (%0), %%mm0\n"
226 " movntq %%mm0, (%1)\n"
227 " movq 8(%0), %%mm1\n"
228 " movntq %%mm1, 8(%1)\n"
229 " movq 16(%0), %%mm2\n"
230 " movntq %%mm2, 16(%1)\n"
231 " movq 24(%0), %%mm3\n"
232 " movntq %%mm3, 24(%1)\n"
233 " movq 32(%0), %%mm4\n"
234 " movntq %%mm4, 32(%1)\n"
235 " movq 40(%0), %%mm5\n"
236 " movntq %%mm5, 40(%1)\n"
237 " movq 48(%0), %%mm6\n"
238 " movntq %%mm6, 48(%1)\n"
239 " movq 56(%0), %%mm7\n"
240 " movntq %%mm7, 56(%1)\n"
241 : : "r" (from), "r" (to) : "memory");
242 from+=64;
243 to+=64;
244 }
245 /* since movntq is weakly-ordered, a "sfence" is needed to become
246 * ordered again.
247 */
248 __asm__ __volatile__ (
249 " sfence \n" : :
250 );
251 kernel_fpu_end();
252}
253
254#else
255
256/*
257 * Generic MMX implementation without K7 specific streaming
258 */
259
260static void fast_clear_page(void *page)
261{
262 int i;
263
264 kernel_fpu_begin();
265
266 __asm__ __volatile__ (
267 " pxor %%mm0, %%mm0\n" : :
268 );
269
270 for(i=0;i<4096/128;i++)
271 {
272 __asm__ __volatile__ (
273 " movq %%mm0, (%0)\n"
274 " movq %%mm0, 8(%0)\n"
275 " movq %%mm0, 16(%0)\n"
276 " movq %%mm0, 24(%0)\n"
277 " movq %%mm0, 32(%0)\n"
278 " movq %%mm0, 40(%0)\n"
279 " movq %%mm0, 48(%0)\n"
280 " movq %%mm0, 56(%0)\n"
281 " movq %%mm0, 64(%0)\n"
282 " movq %%mm0, 72(%0)\n"
283 " movq %%mm0, 80(%0)\n"
284 " movq %%mm0, 88(%0)\n"
285 " movq %%mm0, 96(%0)\n"
286 " movq %%mm0, 104(%0)\n"
287 " movq %%mm0, 112(%0)\n"
288 " movq %%mm0, 120(%0)\n"
289 : : "r" (page) : "memory");
290 page+=128;
291 }
292
293 kernel_fpu_end();
294}
295
296static void fast_copy_page(void *to, void *from)
297{
298 int i;
299
300
301 kernel_fpu_begin();
302
303 __asm__ __volatile__ (
304 "1: prefetch (%0)\n"
305 " prefetch 64(%0)\n"
306 " prefetch 128(%0)\n"
307 " prefetch 192(%0)\n"
308 " prefetch 256(%0)\n"
309 "2: \n"
310 ".section .fixup, \"ax\"\n"
311 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
312 " jmp 2b\n"
313 ".previous\n"
314 ".section __ex_table,\"a\"\n"
315 " .align 4\n"
316 " .long 1b, 3b\n"
317 ".previous"
318 : : "r" (from) );
319
320 for(i=0; i<4096/64; i++)
321 {
322 __asm__ __volatile__ (
323 "1: prefetch 320(%0)\n"
324 "2: movq (%0), %%mm0\n"
325 " movq 8(%0), %%mm1\n"
326 " movq 16(%0), %%mm2\n"
327 " movq 24(%0), %%mm3\n"
328 " movq %%mm0, (%1)\n"
329 " movq %%mm1, 8(%1)\n"
330 " movq %%mm2, 16(%1)\n"
331 " movq %%mm3, 24(%1)\n"
332 " movq 32(%0), %%mm0\n"
333 " movq 40(%0), %%mm1\n"
334 " movq 48(%0), %%mm2\n"
335 " movq 56(%0), %%mm3\n"
336 " movq %%mm0, 32(%1)\n"
337 " movq %%mm1, 40(%1)\n"
338 " movq %%mm2, 48(%1)\n"
339 " movq %%mm3, 56(%1)\n"
340 ".section .fixup, \"ax\"\n"
341 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
342 " jmp 2b\n"
343 ".previous\n"
344 ".section __ex_table,\"a\"\n"
345 " .align 4\n"
346 " .long 1b, 3b\n"
347 ".previous"
348 : : "r" (from), "r" (to) : "memory");
349 from+=64;
350 to+=64;
351 }
352 kernel_fpu_end();
353}
354
355
356#endif
357
358/*
359 * Favour MMX for page clear and copy.
360 */
361
362static void slow_zero_page(void * page)
363{
364 int d0, d1;
365 __asm__ __volatile__( \
366 "cld\n\t" \
367 "rep ; stosl" \
368 : "=&c" (d0), "=&D" (d1)
369 :"a" (0),"1" (page),"0" (1024)
370 :"memory");
371}
372
373void mmx_clear_page(void * page)
374{
375 if(unlikely(in_interrupt()))
376 slow_zero_page(page);
377 else
378 fast_clear_page(page);
379}
380
381static void slow_copy_page(void *to, void *from)
382{
383 int d0, d1, d2;
384 __asm__ __volatile__( \
385 "cld\n\t" \
386 "rep ; movsl" \
387 : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
388 : "0" (1024),"1" ((long) to),"2" ((long) from) \
389 : "memory");
390}
391
392
393void mmx_copy_page(void *to, void *from)
394{
395 if(unlikely(in_interrupt()))
396 slow_copy_page(to, from);
397 else
398 fast_copy_page(to, from);
399}
400
401EXPORT_SYMBOL(_mmx_memcpy);
402EXPORT_SYMBOL(mmx_clear_page);
403EXPORT_SYMBOL(mmx_copy_page);
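The movw fixups above are worth a second look: instead of retrying, a faulting prefetch is patched into a short jump over itself. An illustrative reading of the magic constants (the byte counts are taken from the source's own comments):

	unsigned short patch = 0x1AEB;	/* stored little endian: bytes EB 1A */

	/* 0xEB is the opcode of a short JMP rel8, and 0x1A (26) skips the rest
	 * of the 28-byte block of prefetch instructions (the jmp replaces its
	 * first two bytes), landing on label 2.  0x05EB likewise turns the
	 * single "prefetch 320(%0)" into a jump over its remaining bytes.  So
	 * a CPU whose prefetch faults takes the fault once, gets patched, and
	 * never executes that prefetch again. */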
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
new file mode 100644
index 000000000000..7767962f25d3
--- /dev/null
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -0,0 +1,119 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __rdmsr_safe_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
24}
25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
27{
28 int err = 0;
29 preempt_disable();
30 if (smp_processor_id() == cpu)
31 if (safe)
32 err = rdmsr_safe(msr_no, l, h);
33 else
34 rdmsr(msr_no, *l, *h);
35 else {
36 struct msr_info rv;
37
38 rv.msr_no = msr_no;
39 if (safe) {
40 smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
41 &rv, 0, 1);
42 err = rv.err;
43 } else {
44 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
45 }
46 *l = rv.l;
47 *h = rv.h;
48 }
49 preempt_enable();
50 return err;
51}
52
53static void __wrmsr_on_cpu(void *info)
54{
55 struct msr_info *rv = info;
56
57 wrmsr(rv->msr_no, rv->l, rv->h);
58}
59
60static void __wrmsr_safe_on_cpu(void *info)
61{
62 struct msr_info *rv = info;
63
64 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
65}
66
67static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
68{
69 int err = 0;
70 preempt_disable();
71 if (smp_processor_id() == cpu)
72 if (safe)
73 err = wrmsr_safe(msr_no, l, h);
74 else
75 wrmsr(msr_no, l, h);
76 else {
77 struct msr_info rv;
78
79 rv.msr_no = msr_no;
80 rv.l = l;
81 rv.h = h;
82 if (safe) {
83 smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
84 &rv, 0, 1);
85 err = rv.err;
86 } else {
87 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
88 }
89 }
90 preempt_enable();
91 return err;
92}
93
94void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
95{
96 _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
97}
98
99void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
100{
101 _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
102}
103
104/* These "safe" variants are slower and should be used when the target MSR
105 may not actually exist. */
106int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
107{
108 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
109}
110
111int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
112{
113 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
114}
115
116EXPORT_SYMBOL(rdmsr_on_cpu);
117EXPORT_SYMBOL(wrmsr_on_cpu);
118EXPORT_SYMBOL(rdmsr_safe_on_cpu);
119EXPORT_SYMBOL(wrmsr_safe_on_cpu);
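A short usage sketch; the MSR chosen here is only an example:

	u32 lo, hi;

	/* Read IA32_APIC_BASE as seen by CPU 2, regardless of where we run. */
	rdmsr_on_cpu(2, MSR_IA32_APICBASE, &lo, &hi);

	/* The _safe variants return an error instead of faulting when the MSR
	 * may not exist on that CPU (msr_no here is a hypothetical MSR). */
	if (rdmsr_safe_on_cpu(2, msr_no, &lo, &hi))
		return -EIO;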
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser_32.S
new file mode 100644
index 000000000000..f58fba109d18
--- /dev/null
+++ b/arch/x86/lib/putuser_32.S
@@ -0,0 +1,98 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 2005 Linus Torvalds
5 *
6 * These functions have a non-standard call interface
7 * to make them more efficient, especially as they
8 * return an error value in addition to the "real"
9 * return value.
10 */
11#include <linux/linkage.h>
12#include <asm/dwarf2.h>
13#include <asm/thread_info.h>
14
15
16/*
17 * __put_user_X
18 *
19 * Inputs: %eax[:%edx] contains the data
20 * %ecx contains the address
21 *
22 * Outputs: %eax is error code (0 or -EFAULT)
23 *
24 * These functions should not modify any other registers,
25 * as they get called from within inline assembly.
26 */
27
28#define ENTER CFI_STARTPROC ; \
29 pushl %ebx ; \
30 CFI_ADJUST_CFA_OFFSET 4 ; \
31 CFI_REL_OFFSET ebx, 0 ; \
32 GET_THREAD_INFO(%ebx)
33#define EXIT popl %ebx ; \
34 CFI_ADJUST_CFA_OFFSET -4 ; \
35 CFI_RESTORE ebx ; \
36 ret ; \
37 CFI_ENDPROC
38
39.text
40ENTRY(__put_user_1)
41 ENTER
42 cmpl TI_addr_limit(%ebx),%ecx
43 jae bad_put_user
441: movb %al,(%ecx)
45 xorl %eax,%eax
46 EXIT
47ENDPROC(__put_user_1)
48
49ENTRY(__put_user_2)
50 ENTER
51 movl TI_addr_limit(%ebx),%ebx
52 subl $1,%ebx
53 cmpl %ebx,%ecx
54 jae bad_put_user
552: movw %ax,(%ecx)
56 xorl %eax,%eax
57 EXIT
58ENDPROC(__put_user_2)
59
60ENTRY(__put_user_4)
61 ENTER
62 movl TI_addr_limit(%ebx),%ebx
63 subl $3,%ebx
64 cmpl %ebx,%ecx
65 jae bad_put_user
663: movl %eax,(%ecx)
67 xorl %eax,%eax
68 EXIT
69ENDPROC(__put_user_4)
70
71ENTRY(__put_user_8)
72 ENTER
73 movl TI_addr_limit(%ebx),%ebx
74 subl $7,%ebx
75 cmpl %ebx,%ecx
76 jae bad_put_user
774: movl %eax,(%ecx)
785: movl %edx,4(%ecx)
79 xorl %eax,%eax
80 EXIT
81ENDPROC(__put_user_8)
82
83bad_put_user:
84 CFI_STARTPROC simple
85 CFI_DEF_CFA esp, 2*4
86 CFI_OFFSET eip, -1*4
87 CFI_OFFSET ebx, -2*4
88 movl $-14,%eax
89 EXIT
90END(bad_put_user)
91
92.section __ex_table,"a"
93 .long 1b,bad_put_user
94 .long 2b,bad_put_user
95 .long 3b,bad_put_user
96 .long 4b,bad_put_user
97 .long 5b,bad_put_user
98.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
new file mode 100644
index 000000000000..4989f5a8fa9b
--- /dev/null
+++ b/arch/x86/lib/putuser_64.S
@@ -0,0 +1,106 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/dwarf2.h>
29#include <asm/page.h>
30#include <asm/errno.h>
31#include <asm/asm-offsets.h>
32#include <asm/thread_info.h>
33
34 .text
35ENTRY(__put_user_1)
36 CFI_STARTPROC
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43 CFI_ENDPROC
44ENDPROC(__put_user_1)
45
46ENTRY(__put_user_2)
47 CFI_STARTPROC
48 GET_THREAD_INFO(%r8)
49 addq $1,%rcx
50 jc 20f
51 cmpq threadinfo_addr_limit(%r8),%rcx
52 jae 20f
53 decq %rcx
542: movw %dx,(%rcx)
55 xorl %eax,%eax
56 ret
5720: decq %rcx
58 jmp bad_put_user
59 CFI_ENDPROC
60ENDPROC(__put_user_2)
61
62ENTRY(__put_user_4)
63 CFI_STARTPROC
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl %edx,(%rcx)
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_put_user
75 CFI_ENDPROC
76ENDPROC(__put_user_4)
77
78ENTRY(__put_user_8)
79 CFI_STARTPROC
80 GET_THREAD_INFO(%r8)
81 addq $7,%rcx
82 jc 40f
83 cmpq threadinfo_addr_limit(%r8),%rcx
84 jae 40f
85 subq $7,%rcx
864: movq %rdx,(%rcx)
87 xorl %eax,%eax
88 ret
8940: subq $7,%rcx
90 jmp bad_put_user
91 CFI_ENDPROC
92ENDPROC(__put_user_8)
93
94bad_put_user:
95 CFI_STARTPROC
96 movq $(-EFAULT),%rax
97 ret
98 CFI_ENDPROC
99END(bad_put_user)
100
101.section __ex_table,"a"
102 .quad 1b,bad_put_user
103 .quad 2b,bad_put_user
104 .quad 3b,bad_put_user
105 .quad 4b,bad_put_user
106.previous
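The __ex_table sections emitted at the end of both files pair the address of each store that may fault (the numeric local labels) with bad_put_user. Conceptually, the page-fault handler looks the faulting instruction pointer up in this table and resumes at the fixup instead of oopsing; the sketch below only illustrates that lookup and is not the kernel's actual exception-table code:

    /* Conceptual mirror of one __ex_table entry and its lookup (illustrative). */
    struct example_exentry {
            unsigned long insn;     /* address of the faulting mov (labels 1..5) */
            unsigned long fixup;    /* where to resume: bad_put_user             */
    };

    static unsigned long example_find_fixup(const struct example_exentry *tbl,
                                            unsigned long nr, unsigned long fault_ip)
    {
            unsigned long i;

            for (i = 0; i < nr; i++)        /* linear scan, for illustration only */
                    if (tbl[i].insn == fault_ip)
                            return tbl[i].fixup;
            return 0;                       /* no fixup found: fault is fatal */
    }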
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86/lib/rwlock_64.S
@@ -0,0 +1,38 @@
1/* Slow paths of read/write spinlocks. */
2
3#include <linux/linkage.h>
4#include <asm/rwlock.h>
5#include <asm/alternative-asm.i>
6#include <asm/dwarf2.h>
7
8/* rdi: pointer to rwlock_t */
9ENTRY(__write_lock_failed)
10 CFI_STARTPROC
11 LOCK_PREFIX
12 addl $RW_LOCK_BIAS,(%rdi)
131: rep
14 nop
15 cmpl $RW_LOCK_BIAS,(%rdi)
16 jne 1b
17 LOCK_PREFIX
18 subl $RW_LOCK_BIAS,(%rdi)
19 jnz __write_lock_failed
20 ret
21 CFI_ENDPROC
22END(__write_lock_failed)
23
24/* rdi: pointer to rwlock_t */
25ENTRY(__read_lock_failed)
26 CFI_STARTPROC
27 LOCK_PREFIX
28 incl (%rdi)
291: rep
30 nop
31 cmpl $1,(%rdi)
32 js 1b
33 LOCK_PREFIX
34 decl (%rdi)
35 js __read_lock_failed
36 ret
37 CFI_ENDPROC
38END(__read_lock_failed)
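These are only the contended slow paths; the inline fast path relies on the RW_LOCK_BIAS counter: the lock word starts at the bias, each reader subtracts 1, and a writer subtracts the whole bias and only succeeds if the result is zero. A rough C rendering of that arithmetic, with the locked x86 instructions replaced by GCC atomic builtins purely for illustration (these helpers are not the kernel's rwlock API):

    #define EXAMPLE_RW_LOCK_BIAS 0x01000000     /* assumed value of RW_LOCK_BIAS */

    static int example_write_trylock(int *lock)
    {
            /* writer claims the whole bias; anything left over means it is held */
            if (__sync_sub_and_fetch(lock, EXAMPLE_RW_LOCK_BIAS) == 0)
                    return 1;
            __sync_fetch_and_add(lock, EXAMPLE_RW_LOCK_BIAS);   /* undo, as __write_lock_failed does */
            return 0;
    }

    static int example_read_trylock(int *lock)
    {
            /* each reader takes one unit; a negative result means a writer holds it */
            if (__sync_sub_and_fetch(lock, 1) >= 0)
                    return 1;
            __sync_fetch_and_add(lock, 1);                      /* undo, as __read_lock_failed does */
            return 0;
    }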
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
new file mode 100644
index 000000000000..c01eb39c0b43
--- /dev/null
+++ b/arch/x86/lib/semaphore_32.S
@@ -0,0 +1,219 @@
1/*
2 * i386 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15
16#include <linux/linkage.h>
17#include <asm/rwlock.h>
18#include <asm/alternative-asm.i>
19#include <asm/frame.i>
20#include <asm/dwarf2.h>
21
22/*
23 * The semaphore operations have a special calling sequence that
24 * allow us to do a simpler in-line version of them. These routines
25 * need to convert that sequence back into the C sequence when
26 * there is contention on the semaphore.
27 *
28 * %eax contains the semaphore pointer on entry. Save the C-clobbered
29 * registers (%eax, %edx and %ecx) except %eax, which is either a return
30 * value or just clobbered.
31 */
32 .section .sched.text
33ENTRY(__down_failed)
34 CFI_STARTPROC
35 FRAME
36 pushl %edx
37 CFI_ADJUST_CFA_OFFSET 4
38 CFI_REL_OFFSET edx,0
39 pushl %ecx
40 CFI_ADJUST_CFA_OFFSET 4
41 CFI_REL_OFFSET ecx,0
42 call __down
43 popl %ecx
44 CFI_ADJUST_CFA_OFFSET -4
45 CFI_RESTORE ecx
46 popl %edx
47 CFI_ADJUST_CFA_OFFSET -4
48 CFI_RESTORE edx
49 ENDFRAME
50 ret
51 CFI_ENDPROC
52 END(__down_failed)
53
54ENTRY(__down_failed_interruptible)
55 CFI_STARTPROC
56 FRAME
57 pushl %edx
58 CFI_ADJUST_CFA_OFFSET 4
59 CFI_REL_OFFSET edx,0
60 pushl %ecx
61 CFI_ADJUST_CFA_OFFSET 4
62 CFI_REL_OFFSET ecx,0
63 call __down_interruptible
64 popl %ecx
65 CFI_ADJUST_CFA_OFFSET -4
66 CFI_RESTORE ecx
67 popl %edx
68 CFI_ADJUST_CFA_OFFSET -4
69 CFI_RESTORE edx
70 ENDFRAME
71 ret
72 CFI_ENDPROC
73 END(__down_failed_interruptible)
74
75ENTRY(__down_failed_trylock)
76 CFI_STARTPROC
77 FRAME
78 pushl %edx
79 CFI_ADJUST_CFA_OFFSET 4
80 CFI_REL_OFFSET edx,0
81 pushl %ecx
82 CFI_ADJUST_CFA_OFFSET 4
83 CFI_REL_OFFSET ecx,0
84 call __down_trylock
85 popl %ecx
86 CFI_ADJUST_CFA_OFFSET -4
87 CFI_RESTORE ecx
88 popl %edx
89 CFI_ADJUST_CFA_OFFSET -4
90 CFI_RESTORE edx
91 ENDFRAME
92 ret
93 CFI_ENDPROC
94 END(__down_failed_trylock)
95
96ENTRY(__up_wakeup)
97 CFI_STARTPROC
98 FRAME
99 pushl %edx
100 CFI_ADJUST_CFA_OFFSET 4
101 CFI_REL_OFFSET edx,0
102 pushl %ecx
103 CFI_ADJUST_CFA_OFFSET 4
104 CFI_REL_OFFSET ecx,0
105 call __up
106 popl %ecx
107 CFI_ADJUST_CFA_OFFSET -4
108 CFI_RESTORE ecx
109 popl %edx
110 CFI_ADJUST_CFA_OFFSET -4
111 CFI_RESTORE edx
112 ENDFRAME
113 ret
114 CFI_ENDPROC
115 END(__up_wakeup)
116
117/*
118 * rw spinlock fallbacks
119 */
120#ifdef CONFIG_SMP
121ENTRY(__write_lock_failed)
122 CFI_STARTPROC simple
123 FRAME
1242: LOCK_PREFIX
125 addl $ RW_LOCK_BIAS,(%eax)
1261: rep; nop
127 cmpl $ RW_LOCK_BIAS,(%eax)
128 jne 1b
129 LOCK_PREFIX
130 subl $ RW_LOCK_BIAS,(%eax)
131 jnz 2b
132 ENDFRAME
133 ret
134 CFI_ENDPROC
135 END(__write_lock_failed)
136
137ENTRY(__read_lock_failed)
138 CFI_STARTPROC
139 FRAME
1402: LOCK_PREFIX
141 incl (%eax)
1421: rep; nop
143 cmpl $1,(%eax)
144 js 1b
145 LOCK_PREFIX
146 decl (%eax)
147 js 2b
148 ENDFRAME
149 ret
150 CFI_ENDPROC
151 END(__read_lock_failed)
152
153#endif
154
155#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
156
157/* Fix up special calling conventions */
158ENTRY(call_rwsem_down_read_failed)
159 CFI_STARTPROC
160 push %ecx
161 CFI_ADJUST_CFA_OFFSET 4
162 CFI_REL_OFFSET ecx,0
163 push %edx
164 CFI_ADJUST_CFA_OFFSET 4
165 CFI_REL_OFFSET edx,0
166 call rwsem_down_read_failed
167 pop %edx
168 CFI_ADJUST_CFA_OFFSET -4
169 pop %ecx
170 CFI_ADJUST_CFA_OFFSET -4
171 ret
172 CFI_ENDPROC
173 END(call_rwsem_down_read_failed)
174
175ENTRY(call_rwsem_down_write_failed)
176 CFI_STARTPROC
177 push %ecx
178 CFI_ADJUST_CFA_OFFSET 4
179 CFI_REL_OFFSET ecx,0
180 calll rwsem_down_write_failed
181 pop %ecx
182 CFI_ADJUST_CFA_OFFSET -4
183 ret
184 CFI_ENDPROC
185 END(call_rwsem_down_write_failed)
186
187ENTRY(call_rwsem_wake)
188 CFI_STARTPROC
189 decw %dx /* do nothing if still outstanding active readers */
190 jnz 1f
191 push %ecx
192 CFI_ADJUST_CFA_OFFSET 4
193 CFI_REL_OFFSET ecx,0
194 call rwsem_wake
195 pop %ecx
196 CFI_ADJUST_CFA_OFFSET -4
1971: ret
198 CFI_ENDPROC
199 END(call_rwsem_wake)
200
201/* Fix up special calling conventions */
202ENTRY(call_rwsem_downgrade_wake)
203 CFI_STARTPROC
204 push %ecx
205 CFI_ADJUST_CFA_OFFSET 4
206 CFI_REL_OFFSET ecx,0
207 push %edx
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET edx,0
210 call rwsem_downgrade_wake
211 pop %edx
212 CFI_ADJUST_CFA_OFFSET -4
213 pop %ecx
214 CFI_ADJUST_CFA_OFFSET -4
215 ret
216 CFI_ENDPROC
217 END(call_rwsem_downgrade_wake)
218
219#endif
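The __down_failed/__up_wakeup stubs exist because the semaphore fast path is emitted inline with the semaphore pointer in %eax and must not disturb %ecx/%edx; only on contention does it call into these stubs, which save those registers and hand off to the C routines (__down, __up, ...). A rough C view of that fast path, using a GCC builtin in place of the real inline "lock decl" just to show the control flow (names are hypothetical):

    struct example_semaphore {
            int count;                      /* > 0 free, <= 0 held or has waiters */
    };

    /* hypothetical stand-in for the contended path reached via __down_failed */
    extern void example_down_slow(struct example_semaphore *sem);

    static inline void example_down(struct example_semaphore *sem)
    {
            if (__sync_sub_and_fetch(&sem->count, 1) < 0)
                    example_down_slow(sem); /* asm stub saves %ecx/%edx, calls __down() */
    }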
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
new file mode 100644
index 000000000000..2c773fefa3dd
--- /dev/null
+++ b/arch/x86/lib/string_32.c
@@ -0,0 +1,257 @@
1/*
2 * Most of the string-functions are rather heavily hand-optimized,
3 * see especially strsep,strstr,str[c]spn. They should work, but are not
4 * very easy to understand. Everything is done entirely within the register
5 * set, making the functions fast and clean. String instructions have been
6 * used through-out, making for "slightly" unclear code :-)
7 *
8 * AK: On P4 and K7, using non-string-instruction implementations might be faster
9 * for large memory blocks. But most of them are unlikely to be used on large
10 * strings.
11 */
12
13#include <linux/string.h>
14#include <linux/module.h>
15
16#ifdef __HAVE_ARCH_STRCPY
17char *strcpy(char * dest,const char *src)
18{
19 int d0, d1, d2;
20 asm volatile( "1:\tlodsb\n\t"
21 "stosb\n\t"
22 "testb %%al,%%al\n\t"
23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src),"1" (dest) : "memory");
26 return dest;
27}
28EXPORT_SYMBOL(strcpy);
29#endif
30
31#ifdef __HAVE_ARCH_STRNCPY
32char *strncpy(char * dest,const char *src,size_t count)
33{
34 int d0, d1, d2, d3;
35 asm volatile( "1:\tdecl %2\n\t"
36 "js 2f\n\t"
37 "lodsb\n\t"
38 "stosb\n\t"
39 "testb %%al,%%al\n\t"
40 "jne 1b\n\t"
41 "rep\n\t"
42 "stosb\n"
43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src),"1" (dest),"2" (count) : "memory");
46 return dest;
47}
48EXPORT_SYMBOL(strncpy);
49#endif
50
51#ifdef __HAVE_ARCH_STRCAT
52char *strcat(char * dest,const char * src)
53{
54 int d0, d1, d2, d3;
55 asm volatile( "repne\n\t"
56 "scasb\n\t"
57 "decl %1\n"
58 "1:\tlodsb\n\t"
59 "stosb\n\t"
60 "testb %%al,%%al\n\t"
61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
64 return dest;
65}
66EXPORT_SYMBOL(strcat);
67#endif
68
69#ifdef __HAVE_ARCH_STRNCAT
70char *strncat(char * dest,const char * src,size_t count)
71{
72 int d0, d1, d2, d3;
73 asm volatile( "repne\n\t"
74 "scasb\n\t"
75 "decl %1\n\t"
76 "movl %8,%3\n"
77 "1:\tdecl %3\n\t"
78 "js 2f\n\t"
79 "lodsb\n\t"
80 "stosb\n\t"
81 "testb %%al,%%al\n\t"
82 "jne 1b\n"
83 "2:\txorl %2,%2\n\t"
84 "stosb"
85 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
86 : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
87 : "memory");
88 return dest;
89}
90EXPORT_SYMBOL(strncat);
91#endif
92
93#ifdef __HAVE_ARCH_STRCMP
94int strcmp(const char * cs,const char * ct)
95{
96 int d0, d1;
97 int res;
98 asm volatile( "1:\tlodsb\n\t"
99 "scasb\n\t"
100 "jne 2f\n\t"
101 "testb %%al,%%al\n\t"
102 "jne 1b\n\t"
103 "xorl %%eax,%%eax\n\t"
104 "jmp 3f\n"
105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n"
107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs),"2" (ct)
110 :"memory");
111 return res;
112}
113EXPORT_SYMBOL(strcmp);
114#endif
115
116#ifdef __HAVE_ARCH_STRNCMP
117int strncmp(const char * cs,const char * ct,size_t count)
118{
119 int res;
120 int d0, d1, d2;
121 asm volatile( "1:\tdecl %3\n\t"
122 "js 2f\n\t"
123 "lodsb\n\t"
124 "scasb\n\t"
125 "jne 3f\n\t"
126 "testb %%al,%%al\n\t"
127 "jne 1b\n"
128 "2:\txorl %%eax,%%eax\n\t"
129 "jmp 4f\n"
130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n"
132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs),"2" (ct),"3" (count)
135 :"memory");
136 return res;
137}
138EXPORT_SYMBOL(strncmp);
139#endif
140
141#ifdef __HAVE_ARCH_STRCHR
142char *strchr(const char * s, int c)
143{
144 int d0;
145 char * res;
146 asm volatile( "movb %%al,%%ah\n"
147 "1:\tlodsb\n\t"
148 "cmpb %%ah,%%al\n\t"
149 "je 2f\n\t"
150 "testb %%al,%%al\n\t"
151 "jne 1b\n\t"
152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t"
154 "decl %0"
155 :"=a" (res), "=&S" (d0)
156 :"1" (s),"0" (c)
157 :"memory");
158 return res;
159}
160EXPORT_SYMBOL(strchr);
161#endif
162
163#ifdef __HAVE_ARCH_STRRCHR
164char *strrchr(const char * s, int c)
165{
166 int d0, d1;
167 char * res;
168 asm volatile( "movb %%al,%%ah\n"
169 "1:\tlodsb\n\t"
170 "cmpb %%ah,%%al\n\t"
171 "jne 2f\n\t"
172 "leal -1(%%esi),%0\n"
173 "2:\ttestb %%al,%%al\n\t"
174 "jne 1b"
175 :"=g" (res), "=&S" (d0), "=&a" (d1)
176 :"0" (0),"1" (s),"2" (c)
177 :"memory");
178 return res;
179}
180EXPORT_SYMBOL(strrchr);
181#endif
182
183#ifdef __HAVE_ARCH_STRLEN
184size_t strlen(const char * s)
185{
186 int d0;
187 int res;
188 asm volatile( "repne\n\t"
189 "scasb\n\t"
190 "notl %0\n\t"
191 "decl %0"
192 :"=c" (res), "=&D" (d0)
193 :"1" (s),"a" (0), "0" (0xffffffffu)
194 :"memory");
195 return res;
196}
197EXPORT_SYMBOL(strlen);
198#endif
199
200#ifdef __HAVE_ARCH_MEMCHR
201void *memchr(const void *cs,int c,size_t count)
202{
203 int d0;
204 void *res;
205 if (!count)
206 return NULL;
207 asm volatile( "repne\n\t"
208 "scasb\n\t"
209 "je 1f\n\t"
210 "movl $1,%0\n"
211 "1:\tdecl %0"
212 :"=D" (res), "=&c" (d0)
213 :"a" (c),"0" (cs),"1" (count)
214 :"memory");
215 return res;
216}
217EXPORT_SYMBOL(memchr);
218#endif
219
220#ifdef __HAVE_ARCH_MEMSCAN
221void *memscan(void * addr, int c, size_t size)
222{
223 if (!size)
224 return addr;
225 asm volatile("repnz; scasb\n\t"
226 "jnz 1f\n\t"
227 "dec %%edi\n"
228 "1:"
229 : "=D" (addr), "=c" (size)
230 : "0" (addr), "1" (size), "a" (c)
231 : "memory");
232 return addr;
233}
234EXPORT_SYMBOL(memscan);
235#endif
236
237#ifdef __HAVE_ARCH_STRNLEN
238size_t strnlen(const char *s, size_t count)
239{
240 int d0;
241 int res;
242 asm volatile( "movl %2,%0\n\t"
243 "jmp 2f\n"
244 "1:\tcmpb $0,(%0)\n\t"
245 "je 3f\n\t"
246 "incl %0\n"
247 "2:\tdecl %1\n\t"
248 "cmpl $-1,%1\n\t"
249 "jne 1b\n"
250 "3:\tsubl %2,%0"
251 :"=a" (res), "=&d" (d0)
252 :"c" (s),"1" (count)
253 :"memory");
254 return res;
255}
256EXPORT_SYMBOL(strnlen);
257#endif
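Every function in this file is wrapped in an __HAVE_ARCH_* guard: the architecture's <asm/string.h> defines the macro and declares the prototype, which in turn tells <linux/string.h> not to provide its generic fallback, so the hand-written version above is the one that gets built and exported. A simplified sketch of the header side of that convention (not the literal asm-i386 header):

    /* somewhere in the architecture's asm/string.h (simplified sketch) */
    #define __HAVE_ARCH_STRCPY      /* arch provides its own strcpy */
    extern char *strcpy(char *dest, const char *src);

    /* linux/string.h then does roughly: */
    #ifndef __HAVE_ARCH_STRCPY
    extern char *strcpy(char *, const char *);     /* generic fallback used instead */
    #endif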
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
new file mode 100644
index 000000000000..a3dafbf59dae
--- /dev/null
+++ b/arch/x86/lib/strstr_32.c
@@ -0,0 +1,31 @@
1#include <linux/string.h>
2
3char * strstr(const char * cs,const char * ct)
4{
5	int d0, d1;
6	register char *__res;
7	__asm__ __volatile__(
8 "movl %6,%%edi\n\t"
9 "repne\n\t"
10 "scasb\n\t"
11 "notl %%ecx\n\t"
12 "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
13 "movl %%ecx,%%edx\n"
14 "1:\tmovl %6,%%edi\n\t"
15 "movl %%esi,%%eax\n\t"
16 "movl %%edx,%%ecx\n\t"
17 "repe\n\t"
18 "cmpsb\n\t"
19 "je 2f\n\t" /* also works for empty string, see above */
20 "xchgl %%eax,%%esi\n\t"
21 "incl %%esi\n\t"
22 "cmpb $0,-1(%%eax)\n\t"
23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t"
25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di");
29	return __res;
30}
31
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
new file mode 100644
index 000000000000..55e586d352d3
--- /dev/null
+++ b/arch/x86/lib/thunk_64.S
@@ -0,0 +1,67 @@
1/*
2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 */
7
8 #include <linux/linkage.h>
9 #include <asm/dwarf2.h>
10 #include <asm/calling.h>
11 #include <asm/rwlock.h>
12
13 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
14 .macro thunk name,func
15 .globl \name
16\name:
17 CFI_STARTPROC
18 SAVE_ARGS
19 call \func
20 jmp restore
21 CFI_ENDPROC
22 .endm
23
24 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
25 .macro thunk_retrax name,func
26 .globl \name
27\name:
28 CFI_STARTPROC
29 SAVE_ARGS
30 call \func
31 jmp restore_norax
32 CFI_ENDPROC
33 .endm
34
35
36 .section .sched.text
37#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
39 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
40 thunk rwsem_wake_thunk,rwsem_wake
41 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
42#endif
43
44 thunk __down_failed,__down
45 thunk_retrax __down_failed_interruptible,__down_interruptible
46 thunk_retrax __down_failed_trylock,__down_trylock
47 thunk __up_wakeup,__up
48
49#ifdef CONFIG_TRACE_IRQFLAGS
50 thunk trace_hardirqs_on_thunk,trace_hardirqs_on
51 thunk trace_hardirqs_off_thunk,trace_hardirqs_off
52#endif
53
54 /* SAVE_ARGS below is used only for the .cfi directives it contains. */
55 CFI_STARTPROC
56 SAVE_ARGS
57restore:
58 RESTORE_ARGS
59 ret
60 CFI_ENDPROC
61
62 CFI_STARTPROC
63 SAVE_ARGS
64restore_norax:
65 RESTORE_ARGS 1
66 ret
67 CFI_ENDPROC
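The thunks matter at call sites that must not perturb register allocation, typically entry code or inline assembly: because SAVE_ARGS/RESTORE_ARGS preserve the C-clobbered argument registers around the call, the caller does not have to list them as clobbers. A hedged, illustrative caller (the real users are the x86-64 entry and irq-flags paths, not ordinary C code):

    /* Illustrative only: calling the thunk from inline asm without declaring
     * %rdi/%rsi/%rdx/%rcx/%rax/%r8-%r11 clobbered, since the thunk saves them. */
    static inline void example_trace_irqs_on(void)
    {
            asm volatile("call trace_hardirqs_on_thunk" ::: "memory");
    }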
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
new file mode 100644
index 000000000000..9f38b12b4af1
--- /dev/null
+++ b/arch/x86/lib/usercopy_32.c
@@ -0,0 +1,882 @@
1/*
2 * User address space access functions.
3 * The non-inlined parts of asm-i386/uaccess.h are here.
4 *
5 * Copyright 1997 Andi Kleen <ak@muc.de>
6 * Copyright 1997 Linus Torvalds
7 */
8#include <linux/mm.h>
9#include <linux/highmem.h>
10#include <linux/blkdev.h>
11#include <linux/module.h>
12#include <linux/backing-dev.h>
13#include <linux/interrupt.h>
14#include <asm/uaccess.h>
15#include <asm/mmx.h>
16
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{
19#ifdef CONFIG_X86_INTEL_USERCOPY
20 if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask))
21 return 0;
22#endif
23 return 1;
24}
25#define movsl_is_ok(a1,a2,n) \
26 __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
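In other words: for copies of 64 bytes or more on CPUs where movsl_mask.mask is set, the rep;movsl fast path is skipped whenever source and destination disagree in the masked low address bits. A small worked example of the test (assuming mask == 7, the value the CPU setup code is understood to use on the affected Intel parts):

    /*   a1 = 0x1003 (dst), a2 = 0x2000 (src), n = 128
     *   (a1 ^ a2)     = 0x3003
     *   (a1 ^ a2) & 7 = 3          -> non-zero, and n >= 64
     *   => movsl_is_ok() returns 0, so the callers below use the
     *      unrolled __copy_user_intel()/__copy_user_zeroing_intel() paths. */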
27
28/*
29 * Copy a null terminated string from userspace.
30 */
31
32#define __do_strncpy_from_user(dst,src,count,res) \
33do { \
34 int __d0, __d1, __d2; \
35 might_sleep(); \
36 __asm__ __volatile__( \
37 " testl %1,%1\n" \
38 " jz 2f\n" \
39 "0: lodsb\n" \
40 " stosb\n" \
41 " testb %%al,%%al\n" \
42 " jz 1f\n" \
43 " decl %1\n" \
44 " jnz 0b\n" \
45 "1: subl %1,%0\n" \
46 "2:\n" \
47 ".section .fixup,\"ax\"\n" \
48 "3: movl %5,%0\n" \
49 " jmp 2b\n" \
50 ".previous\n" \
51 ".section __ex_table,\"a\"\n" \
52 " .align 4\n" \
53 " .long 0b,3b\n" \
54 ".previous" \
55 : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
56 "=&D" (__d2) \
57 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
58 : "memory"); \
59} while (0)
60
61/**
62 * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
63 * @dst: Destination address, in kernel space. This buffer must be at
64 * least @count bytes long.
65 * @src: Source address, in user space.
66 * @count: Maximum number of bytes to copy, including the trailing NUL.
67 *
68 * Copies a NUL-terminated string from userspace to kernel space.
69 * Caller must check the specified block with access_ok() before calling
70 * this function.
71 *
72 * On success, returns the length of the string (not including the trailing
73 * NUL).
74 *
75 * If access to userspace fails, returns -EFAULT (some data may have been
76 * copied).
77 *
78 * If @count is smaller than the length of the string, copies @count bytes
79 * and returns @count.
80 */
81long
82__strncpy_from_user(char *dst, const char __user *src, long count)
83{
84 long res;
85 __do_strncpy_from_user(dst, src, count, res);
86 return res;
87}
88EXPORT_SYMBOL(__strncpy_from_user);
89
90/**
91 * strncpy_from_user: - Copy a NUL terminated string from userspace.
92 * @dst: Destination address, in kernel space. This buffer must be at
93 * least @count bytes long.
94 * @src: Source address, in user space.
95 * @count: Maximum number of bytes to copy, including the trailing NUL.
96 *
97 * Copies a NUL-terminated string from userspace to kernel space.
98 *
99 * On success, returns the length of the string (not including the trailing
100 * NUL).
101 *
102 * If access to userspace fails, returns -EFAULT (some data may have been
103 * copied).
104 *
105 * If @count is smaller than the length of the string, copies @count bytes
106 * and returns @count.
107 */
108long
109strncpy_from_user(char *dst, const char __user *src, long count)
110{
111 long res = -EFAULT;
112 if (access_ok(VERIFY_READ, src, 1))
113 __do_strncpy_from_user(dst, src, count, res);
114 return res;
115}
116EXPORT_SYMBOL(strncpy_from_user);
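A typical caller copies a user-supplied name into a fixed-size kernel buffer and has to distinguish the three documented outcomes: a fault, a fitting string, or truncation at @count. A hedged usage sketch with hypothetical names and a 32-byte buffer:

    static int example_copy_name(char *kbuf /* >= 32 bytes */,
                                 const char __user *uname)
    {
            long len = strncpy_from_user(kbuf, uname, 32);

            if (len < 0)
                    return len;             /* -EFAULT: bad user pointer        */
            if (len == 32)
                    return -ENAMETOOLONG;   /* hit @count: string was truncated */
            return 0;                       /* len == strlen(kbuf), buffer is NUL-terminated */
    }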
117
118/*
119 * Zero Userspace
120 */
121
122#define __do_clear_user(addr,size) \
123do { \
124 int __d0; \
125 might_sleep(); \
126 __asm__ __volatile__( \
127 "0: rep; stosl\n" \
128 " movl %2,%0\n" \
129 "1: rep; stosb\n" \
130 "2:\n" \
131 ".section .fixup,\"ax\"\n" \
132 "3: lea 0(%2,%0,4),%0\n" \
133 " jmp 2b\n" \
134 ".previous\n" \
135 ".section __ex_table,\"a\"\n" \
136 " .align 4\n" \
137 " .long 0b,3b\n" \
138 " .long 1b,2b\n" \
139 ".previous" \
140 : "=&c"(size), "=&D" (__d0) \
141 : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
142} while (0)
143
144/**
145 * clear_user: - Zero a block of memory in user space.
146 * @to: Destination address, in user space.
147 * @n: Number of bytes to zero.
148 *
149 * Zero a block of memory in user space.
150 *
151 * Returns number of bytes that could not be cleared.
152 * On success, this will be zero.
153 */
154unsigned long
155clear_user(void __user *to, unsigned long n)
156{
157 might_sleep();
158 if (access_ok(VERIFY_WRITE, to, n))
159 __do_clear_user(to, n);
160 return n;
161}
162EXPORT_SYMBOL(clear_user);
163
164/**
165 * __clear_user: - Zero a block of memory in user space, with less checking.
166 * @to: Destination address, in user space.
167 * @n: Number of bytes to zero.
168 *
169 * Zero a block of memory in user space. Caller must check
170 * the specified block with access_ok() before calling this function.
171 *
172 * Returns number of bytes that could not be cleared.
173 * On success, this will be zero.
174 */
175unsigned long
176__clear_user(void __user *to, unsigned long n)
177{
178 __do_clear_user(to, n);
179 return n;
180}
181EXPORT_SYMBOL(__clear_user);
182
183/**
184 * strnlen_user: - Get the size of a string in user space.
185 * @s: The string to measure.
186 * @n: The maximum valid length
187 *
188 * Get the size of a NUL-terminated string in user space.
189 *
190 * Returns the size of the string INCLUDING the terminating NUL.
191 * On exception, returns 0.
192 * If the string is too long, returns a value greater than @n.
193 */
194long strnlen_user(const char __user *s, long n)
195{
196 unsigned long mask = -__addr_ok(s);
197 unsigned long res, tmp;
198
199 might_sleep();
200
201 __asm__ __volatile__(
202 " testl %0, %0\n"
203 " jz 3f\n"
204 " andl %0,%%ecx\n"
205 "0: repne; scasb\n"
206 " setne %%al\n"
207 " subl %%ecx,%0\n"
208 " addl %0,%%eax\n"
209 "1:\n"
210 ".section .fixup,\"ax\"\n"
211 "2: xorl %%eax,%%eax\n"
212 " jmp 1b\n"
213 "3: movb $1,%%al\n"
214 " jmp 1b\n"
215 ".previous\n"
216 ".section __ex_table,\"a\"\n"
217 " .align 4\n"
218 " .long 0b,2b\n"
219 ".previous"
220 :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
221 :"0" (n), "1" (s), "2" (0), "3" (mask)
222 :"cc");
223 return res & mask;
224}
225EXPORT_SYMBOL(strnlen_user);
226
227#ifdef CONFIG_X86_INTEL_USERCOPY
228static unsigned long
229__copy_user_intel(void __user *to, const void *from, unsigned long size)
230{
231 int d0, d1;
232 __asm__ __volatile__(
233 " .align 2,0x90\n"
234 "1: movl 32(%4), %%eax\n"
235 " cmpl $67, %0\n"
236 " jbe 3f\n"
237 "2: movl 64(%4), %%eax\n"
238 " .align 2,0x90\n"
239 "3: movl 0(%4), %%eax\n"
240 "4: movl 4(%4), %%edx\n"
241 "5: movl %%eax, 0(%3)\n"
242 "6: movl %%edx, 4(%3)\n"
243 "7: movl 8(%4), %%eax\n"
244 "8: movl 12(%4),%%edx\n"
245 "9: movl %%eax, 8(%3)\n"
246 "10: movl %%edx, 12(%3)\n"
247 "11: movl 16(%4), %%eax\n"
248 "12: movl 20(%4), %%edx\n"
249 "13: movl %%eax, 16(%3)\n"
250 "14: movl %%edx, 20(%3)\n"
251 "15: movl 24(%4), %%eax\n"
252 "16: movl 28(%4), %%edx\n"
253 "17: movl %%eax, 24(%3)\n"
254 "18: movl %%edx, 28(%3)\n"
255 "19: movl 32(%4), %%eax\n"
256 "20: movl 36(%4), %%edx\n"
257 "21: movl %%eax, 32(%3)\n"
258 "22: movl %%edx, 36(%3)\n"
259 "23: movl 40(%4), %%eax\n"
260 "24: movl 44(%4), %%edx\n"
261 "25: movl %%eax, 40(%3)\n"
262 "26: movl %%edx, 44(%3)\n"
263 "27: movl 48(%4), %%eax\n"
264 "28: movl 52(%4), %%edx\n"
265 "29: movl %%eax, 48(%3)\n"
266 "30: movl %%edx, 52(%3)\n"
267 "31: movl 56(%4), %%eax\n"
268 "32: movl 60(%4), %%edx\n"
269 "33: movl %%eax, 56(%3)\n"
270 "34: movl %%edx, 60(%3)\n"
271 " addl $-64, %0\n"
272 " addl $64, %4\n"
273 " addl $64, %3\n"
274 " cmpl $63, %0\n"
275 " ja 1b\n"
276 "35: movl %0, %%eax\n"
277 " shrl $2, %0\n"
278 " andl $3, %%eax\n"
279 " cld\n"
280 "99: rep; movsl\n"
281 "36: movl %%eax, %0\n"
282 "37: rep; movsb\n"
283 "100:\n"
284 ".section .fixup,\"ax\"\n"
285 "101: lea 0(%%eax,%0,4),%0\n"
286 " jmp 100b\n"
287 ".previous\n"
288 ".section __ex_table,\"a\"\n"
289 " .align 4\n"
290 " .long 1b,100b\n"
291 " .long 2b,100b\n"
292 " .long 3b,100b\n"
293 " .long 4b,100b\n"
294 " .long 5b,100b\n"
295 " .long 6b,100b\n"
296 " .long 7b,100b\n"
297 " .long 8b,100b\n"
298 " .long 9b,100b\n"
299 " .long 10b,100b\n"
300 " .long 11b,100b\n"
301 " .long 12b,100b\n"
302 " .long 13b,100b\n"
303 " .long 14b,100b\n"
304 " .long 15b,100b\n"
305 " .long 16b,100b\n"
306 " .long 17b,100b\n"
307 " .long 18b,100b\n"
308 " .long 19b,100b\n"
309 " .long 20b,100b\n"
310 " .long 21b,100b\n"
311 " .long 22b,100b\n"
312 " .long 23b,100b\n"
313 " .long 24b,100b\n"
314 " .long 25b,100b\n"
315 " .long 26b,100b\n"
316 " .long 27b,100b\n"
317 " .long 28b,100b\n"
318 " .long 29b,100b\n"
319 " .long 30b,100b\n"
320 " .long 31b,100b\n"
321 " .long 32b,100b\n"
322 " .long 33b,100b\n"
323 " .long 34b,100b\n"
324 " .long 35b,100b\n"
325 " .long 36b,100b\n"
326 " .long 37b,100b\n"
327 " .long 99b,101b\n"
328 ".previous"
329 : "=&c"(size), "=&D" (d0), "=&S" (d1)
330 : "1"(to), "2"(from), "0"(size)
331 : "eax", "edx", "memory");
332 return size;
333}
334
335static unsigned long
336__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
337{
338 int d0, d1;
339 __asm__ __volatile__(
340 " .align 2,0x90\n"
341 "0: movl 32(%4), %%eax\n"
342 " cmpl $67, %0\n"
343 " jbe 2f\n"
344 "1: movl 64(%4), %%eax\n"
345 " .align 2,0x90\n"
346 "2: movl 0(%4), %%eax\n"
347 "21: movl 4(%4), %%edx\n"
348 " movl %%eax, 0(%3)\n"
349 " movl %%edx, 4(%3)\n"
350 "3: movl 8(%4), %%eax\n"
351 "31: movl 12(%4),%%edx\n"
352 " movl %%eax, 8(%3)\n"
353 " movl %%edx, 12(%3)\n"
354 "4: movl 16(%4), %%eax\n"
355 "41: movl 20(%4), %%edx\n"
356 " movl %%eax, 16(%3)\n"
357 " movl %%edx, 20(%3)\n"
358 "10: movl 24(%4), %%eax\n"
359 "51: movl 28(%4), %%edx\n"
360 " movl %%eax, 24(%3)\n"
361 " movl %%edx, 28(%3)\n"
362 "11: movl 32(%4), %%eax\n"
363 "61: movl 36(%4), %%edx\n"
364 " movl %%eax, 32(%3)\n"
365 " movl %%edx, 36(%3)\n"
366 "12: movl 40(%4), %%eax\n"
367 "71: movl 44(%4), %%edx\n"
368 " movl %%eax, 40(%3)\n"
369 " movl %%edx, 44(%3)\n"
370 "13: movl 48(%4), %%eax\n"
371 "81: movl 52(%4), %%edx\n"
372 " movl %%eax, 48(%3)\n"
373 " movl %%edx, 52(%3)\n"
374 "14: movl 56(%4), %%eax\n"
375 "91: movl 60(%4), %%edx\n"
376 " movl %%eax, 56(%3)\n"
377 " movl %%edx, 60(%3)\n"
378 " addl $-64, %0\n"
379 " addl $64, %4\n"
380 " addl $64, %3\n"
381 " cmpl $63, %0\n"
382 " ja 0b\n"
383 "5: movl %0, %%eax\n"
384 " shrl $2, %0\n"
385 " andl $3, %%eax\n"
386 " cld\n"
387 "6: rep; movsl\n"
388 " movl %%eax,%0\n"
389 "7: rep; movsb\n"
390 "8:\n"
391 ".section .fixup,\"ax\"\n"
392 "9: lea 0(%%eax,%0,4),%0\n"
393 "16: pushl %0\n"
394 " pushl %%eax\n"
395 " xorl %%eax,%%eax\n"
396 " rep; stosb\n"
397 " popl %%eax\n"
398 " popl %0\n"
399 " jmp 8b\n"
400 ".previous\n"
401 ".section __ex_table,\"a\"\n"
402 " .align 4\n"
403 " .long 0b,16b\n"
404 " .long 1b,16b\n"
405 " .long 2b,16b\n"
406 " .long 21b,16b\n"
407 " .long 3b,16b\n"
408 " .long 31b,16b\n"
409 " .long 4b,16b\n"
410 " .long 41b,16b\n"
411 " .long 10b,16b\n"
412 " .long 51b,16b\n"
413 " .long 11b,16b\n"
414 " .long 61b,16b\n"
415 " .long 12b,16b\n"
416 " .long 71b,16b\n"
417 " .long 13b,16b\n"
418 " .long 81b,16b\n"
419 " .long 14b,16b\n"
420 " .long 91b,16b\n"
421 " .long 6b,9b\n"
422 " .long 7b,16b\n"
423 ".previous"
424 : "=&c"(size), "=&D" (d0), "=&S" (d1)
425 : "1"(to), "2"(from), "0"(size)
426 : "eax", "edx", "memory");
427 return size;
428}
429
430/*
431 * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
432 * hyoshiok@miraclelinux.com
433 */
434
435static unsigned long __copy_user_zeroing_intel_nocache(void *to,
436 const void __user *from, unsigned long size)
437{
438 int d0, d1;
439
440 __asm__ __volatile__(
441 " .align 2,0x90\n"
442 "0: movl 32(%4), %%eax\n"
443 " cmpl $67, %0\n"
444 " jbe 2f\n"
445 "1: movl 64(%4), %%eax\n"
446 " .align 2,0x90\n"
447 "2: movl 0(%4), %%eax\n"
448 "21: movl 4(%4), %%edx\n"
449 " movnti %%eax, 0(%3)\n"
450 " movnti %%edx, 4(%3)\n"
451 "3: movl 8(%4), %%eax\n"
452 "31: movl 12(%4),%%edx\n"
453 " movnti %%eax, 8(%3)\n"
454 " movnti %%edx, 12(%3)\n"
455 "4: movl 16(%4), %%eax\n"
456 "41: movl 20(%4), %%edx\n"
457 " movnti %%eax, 16(%3)\n"
458 " movnti %%edx, 20(%3)\n"
459 "10: movl 24(%4), %%eax\n"
460 "51: movl 28(%4), %%edx\n"
461 " movnti %%eax, 24(%3)\n"
462 " movnti %%edx, 28(%3)\n"
463 "11: movl 32(%4), %%eax\n"
464 "61: movl 36(%4), %%edx\n"
465 " movnti %%eax, 32(%3)\n"
466 " movnti %%edx, 36(%3)\n"
467 "12: movl 40(%4), %%eax\n"
468 "71: movl 44(%4), %%edx\n"
469 " movnti %%eax, 40(%3)\n"
470 " movnti %%edx, 44(%3)\n"
471 "13: movl 48(%4), %%eax\n"
472 "81: movl 52(%4), %%edx\n"
473 " movnti %%eax, 48(%3)\n"
474 " movnti %%edx, 52(%3)\n"
475 "14: movl 56(%4), %%eax\n"
476 "91: movl 60(%4), %%edx\n"
477 " movnti %%eax, 56(%3)\n"
478 " movnti %%edx, 60(%3)\n"
479 " addl $-64, %0\n"
480 " addl $64, %4\n"
481 " addl $64, %3\n"
482 " cmpl $63, %0\n"
483 " ja 0b\n"
484 " sfence \n"
485 "5: movl %0, %%eax\n"
486 " shrl $2, %0\n"
487 " andl $3, %%eax\n"
488 " cld\n"
489 "6: rep; movsl\n"
490 " movl %%eax,%0\n"
491 "7: rep; movsb\n"
492 "8:\n"
493 ".section .fixup,\"ax\"\n"
494 "9: lea 0(%%eax,%0,4),%0\n"
495 "16: pushl %0\n"
496 " pushl %%eax\n"
497 " xorl %%eax,%%eax\n"
498 " rep; stosb\n"
499 " popl %%eax\n"
500 " popl %0\n"
501 " jmp 8b\n"
502 ".previous\n"
503 ".section __ex_table,\"a\"\n"
504 " .align 4\n"
505 " .long 0b,16b\n"
506 " .long 1b,16b\n"
507 " .long 2b,16b\n"
508 " .long 21b,16b\n"
509 " .long 3b,16b\n"
510 " .long 31b,16b\n"
511 " .long 4b,16b\n"
512 " .long 41b,16b\n"
513 " .long 10b,16b\n"
514 " .long 51b,16b\n"
515 " .long 11b,16b\n"
516 " .long 61b,16b\n"
517 " .long 12b,16b\n"
518 " .long 71b,16b\n"
519 " .long 13b,16b\n"
520 " .long 81b,16b\n"
521 " .long 14b,16b\n"
522 " .long 91b,16b\n"
523 " .long 6b,9b\n"
524 " .long 7b,16b\n"
525 ".previous"
526 : "=&c"(size), "=&D" (d0), "=&S" (d1)
527 : "1"(to), "2"(from), "0"(size)
528 : "eax", "edx", "memory");
529 return size;
530}
531
532static unsigned long __copy_user_intel_nocache(void *to,
533 const void __user *from, unsigned long size)
534{
535 int d0, d1;
536
537 __asm__ __volatile__(
538 " .align 2,0x90\n"
539 "0: movl 32(%4), %%eax\n"
540 " cmpl $67, %0\n"
541 " jbe 2f\n"
542 "1: movl 64(%4), %%eax\n"
543 " .align 2,0x90\n"
544 "2: movl 0(%4), %%eax\n"
545 "21: movl 4(%4), %%edx\n"
546 " movnti %%eax, 0(%3)\n"
547 " movnti %%edx, 4(%3)\n"
548 "3: movl 8(%4), %%eax\n"
549 "31: movl 12(%4),%%edx\n"
550 " movnti %%eax, 8(%3)\n"
551 " movnti %%edx, 12(%3)\n"
552 "4: movl 16(%4), %%eax\n"
553 "41: movl 20(%4), %%edx\n"
554 " movnti %%eax, 16(%3)\n"
555 " movnti %%edx, 20(%3)\n"
556 "10: movl 24(%4), %%eax\n"
557 "51: movl 28(%4), %%edx\n"
558 " movnti %%eax, 24(%3)\n"
559 " movnti %%edx, 28(%3)\n"
560 "11: movl 32(%4), %%eax\n"
561 "61: movl 36(%4), %%edx\n"
562 " movnti %%eax, 32(%3)\n"
563 " movnti %%edx, 36(%3)\n"
564 "12: movl 40(%4), %%eax\n"
565 "71: movl 44(%4), %%edx\n"
566 " movnti %%eax, 40(%3)\n"
567 " movnti %%edx, 44(%3)\n"
568 "13: movl 48(%4), %%eax\n"
569 "81: movl 52(%4), %%edx\n"
570 " movnti %%eax, 48(%3)\n"
571 " movnti %%edx, 52(%3)\n"
572 "14: movl 56(%4), %%eax\n"
573 "91: movl 60(%4), %%edx\n"
574 " movnti %%eax, 56(%3)\n"
575 " movnti %%edx, 60(%3)\n"
576 " addl $-64, %0\n"
577 " addl $64, %4\n"
578 " addl $64, %3\n"
579 " cmpl $63, %0\n"
580 " ja 0b\n"
581 " sfence \n"
582 "5: movl %0, %%eax\n"
583 " shrl $2, %0\n"
584 " andl $3, %%eax\n"
585 " cld\n"
586 "6: rep; movsl\n"
587 " movl %%eax,%0\n"
588 "7: rep; movsb\n"
589 "8:\n"
590 ".section .fixup,\"ax\"\n"
591 "9: lea 0(%%eax,%0,4),%0\n"
592 "16: jmp 8b\n"
593 ".previous\n"
594 ".section __ex_table,\"a\"\n"
595 " .align 4\n"
596 " .long 0b,16b\n"
597 " .long 1b,16b\n"
598 " .long 2b,16b\n"
599 " .long 21b,16b\n"
600 " .long 3b,16b\n"
601 " .long 31b,16b\n"
602 " .long 4b,16b\n"
603 " .long 41b,16b\n"
604 " .long 10b,16b\n"
605 " .long 51b,16b\n"
606 " .long 11b,16b\n"
607 " .long 61b,16b\n"
608 " .long 12b,16b\n"
609 " .long 71b,16b\n"
610 " .long 13b,16b\n"
611 " .long 81b,16b\n"
612 " .long 14b,16b\n"
613 " .long 91b,16b\n"
614 " .long 6b,9b\n"
615 " .long 7b,16b\n"
616 ".previous"
617 : "=&c"(size), "=&D" (d0), "=&S" (d1)
618 : "1"(to), "2"(from), "0"(size)
619 : "eax", "edx", "memory");
620 return size;
621}
622
623#else
624
625/*
626 * Leave these declared but undefined. There should not be any references to
627 * them.
628 */
629unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
630 unsigned long size);
631unsigned long __copy_user_intel(void __user *to, const void *from,
632 unsigned long size);
633unsigned long __copy_user_zeroing_intel_nocache(void *to,
634 const void __user *from, unsigned long size);
635#endif /* CONFIG_X86_INTEL_USERCOPY */
636
637/* Generic arbitrary sized copy. */
638#define __copy_user(to,from,size) \
639do { \
640 int __d0, __d1, __d2; \
641 __asm__ __volatile__( \
642 " cmp $7,%0\n" \
643 " jbe 1f\n" \
644 " movl %1,%0\n" \
645 " negl %0\n" \
646 " andl $7,%0\n" \
647 " subl %0,%3\n" \
648 "4: rep; movsb\n" \
649 " movl %3,%0\n" \
650 " shrl $2,%0\n" \
651 " andl $3,%3\n" \
652 " .align 2,0x90\n" \
653 "0: rep; movsl\n" \
654 " movl %3,%0\n" \
655 "1: rep; movsb\n" \
656 "2:\n" \
657 ".section .fixup,\"ax\"\n" \
658 "5: addl %3,%0\n" \
659 " jmp 2b\n" \
660 "3: lea 0(%3,%0,4),%0\n" \
661 " jmp 2b\n" \
662 ".previous\n" \
663 ".section __ex_table,\"a\"\n" \
664 " .align 4\n" \
665 " .long 4b,5b\n" \
666 " .long 0b,3b\n" \
667 " .long 1b,2b\n" \
668 ".previous" \
669 : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
670 : "3"(size), "0"(size), "1"(to), "2"(from) \
671 : "memory"); \
672} while (0)
673
674#define __copy_user_zeroing(to,from,size) \
675do { \
676 int __d0, __d1, __d2; \
677 __asm__ __volatile__( \
678 " cmp $7,%0\n" \
679 " jbe 1f\n" \
680 " movl %1,%0\n" \
681 " negl %0\n" \
682 " andl $7,%0\n" \
683 " subl %0,%3\n" \
684 "4: rep; movsb\n" \
685 " movl %3,%0\n" \
686 " shrl $2,%0\n" \
687 " andl $3,%3\n" \
688 " .align 2,0x90\n" \
689 "0: rep; movsl\n" \
690 " movl %3,%0\n" \
691 "1: rep; movsb\n" \
692 "2:\n" \
693 ".section .fixup,\"ax\"\n" \
694 "5: addl %3,%0\n" \
695 " jmp 6f\n" \
696 "3: lea 0(%3,%0,4),%0\n" \
697 "6: pushl %0\n" \
698 " pushl %%eax\n" \
699 " xorl %%eax,%%eax\n" \
700 " rep; stosb\n" \
701 " popl %%eax\n" \
702 " popl %0\n" \
703 " jmp 2b\n" \
704 ".previous\n" \
705 ".section __ex_table,\"a\"\n" \
706 " .align 4\n" \
707 " .long 4b,5b\n" \
708 " .long 0b,3b\n" \
709 " .long 1b,6b\n" \
710 ".previous" \
711 : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
712 : "3"(size), "0"(size), "1"(to), "2"(from) \
713 : "memory"); \
714} while (0)
715
716unsigned long __copy_to_user_ll(void __user *to, const void *from,
717 unsigned long n)
718{
719#ifndef CONFIG_X86_WP_WORKS_OK
720 if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
721 ((unsigned long )to) < TASK_SIZE) {
722 /*
723 * When we are in an atomic section (see
724 * mm/filemap.c:file_read_actor), return the full
725 * length to take the slow path.
726 */
727 if (in_atomic())
728 return n;
729
730 /*
731 * CPU does not honor the WP bit when writing
732 * from supervisory mode, and due to preemption or SMP,
733 * the page tables can change at any time.
734 * Do it manually. Manfred <manfred@colorfullife.com>
735 */
736 while (n) {
737 unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
738 unsigned long len = PAGE_SIZE - offset;
739 int retval;
740 struct page *pg;
741 void *maddr;
742
743 if (len > n)
744 len = n;
745
746survive:
747 down_read(&current->mm->mmap_sem);
748 retval = get_user_pages(current, current->mm,
749 (unsigned long )to, 1, 1, 0, &pg, NULL);
750
751 if (retval == -ENOMEM && is_init(current)) {
752 up_read(&current->mm->mmap_sem);
753 congestion_wait(WRITE, HZ/50);
754 goto survive;
755 }
756
757 if (retval != 1) {
758 up_read(&current->mm->mmap_sem);
759 break;
760 }
761
762 maddr = kmap_atomic(pg, KM_USER0);
763 memcpy(maddr + offset, from, len);
764 kunmap_atomic(maddr, KM_USER0);
765 set_page_dirty_lock(pg);
766 put_page(pg);
767 up_read(&current->mm->mmap_sem);
768
769 from += len;
770 to += len;
771 n -= len;
772 }
773 return n;
774 }
775#endif
776 if (movsl_is_ok(to, from, n))
777 __copy_user(to, from, n);
778 else
779 n = __copy_user_intel(to, from, n);
780 return n;
781}
782EXPORT_SYMBOL(__copy_to_user_ll);
783
784unsigned long __copy_from_user_ll(void *to, const void __user *from,
785 unsigned long n)
786{
787 if (movsl_is_ok(to, from, n))
788 __copy_user_zeroing(to, from, n);
789 else
790 n = __copy_user_zeroing_intel(to, from, n);
791 return n;
792}
793EXPORT_SYMBOL(__copy_from_user_ll);
794
795unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
796 unsigned long n)
797{
798 if (movsl_is_ok(to, from, n))
799 __copy_user(to, from, n);
800 else
801 n = __copy_user_intel((void __user *)to,
802 (const void *)from, n);
803 return n;
804}
805EXPORT_SYMBOL(__copy_from_user_ll_nozero);
806
807unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
808 unsigned long n)
809{
810#ifdef CONFIG_X86_INTEL_USERCOPY
811 if ( n > 64 && cpu_has_xmm2)
812 n = __copy_user_zeroing_intel_nocache(to, from, n);
813 else
814 __copy_user_zeroing(to, from, n);
815#else
816 __copy_user_zeroing(to, from, n);
817#endif
818 return n;
819}
820
821unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
822 unsigned long n)
823{
824#ifdef CONFIG_X86_INTEL_USERCOPY
825 if ( n > 64 && cpu_has_xmm2)
826 n = __copy_user_intel_nocache(to, from, n);
827 else
828 __copy_user(to, from, n);
829#else
830 __copy_user(to, from, n);
831#endif
832 return n;
833}
834
835/**
836 * copy_to_user: - Copy a block of data into user space.
837 * @to: Destination address, in user space.
838 * @from: Source address, in kernel space.
839 * @n: Number of bytes to copy.
840 *
841 * Context: User context only. This function may sleep.
842 *
843 * Copy data from kernel space to user space.
844 *
845 * Returns number of bytes that could not be copied.
846 * On success, this will be zero.
847 */
848unsigned long
849copy_to_user(void __user *to, const void *from, unsigned long n)
850{
851 if (access_ok(VERIFY_WRITE, to, n))
852 n = __copy_to_user(to, from, n);
853 return n;
854}
855EXPORT_SYMBOL(copy_to_user);
856
857/**
858 * copy_from_user: - Copy a block of data from user space.
859 * @to: Destination address, in kernel space.
860 * @from: Source address, in user space.
861 * @n: Number of bytes to copy.
862 *
863 * Context: User context only. This function may sleep.
864 *
865 * Copy data from user space to kernel space.
866 *
867 * Returns number of bytes that could not be copied.
868 * On success, this will be zero.
869 *
870 * If some data could not be copied, this function will pad the copied
871 * data to the requested size using zero bytes.
872 */
873unsigned long
874copy_from_user(void *to, const void __user *from, unsigned long n)
875{
876 if (access_ok(VERIFY_READ, from, n))
877 n = __copy_from_user(to, from, n);
878 else
879 memset(to, 0, n);
880 return n;
881}
882EXPORT_SYMBOL(copy_from_user);
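Note that both helpers return the number of bytes that could not be copied, not an errno, so callers conventionally map any non-zero result to -EFAULT. A short usage sketch with a hypothetical argument structure:

    struct example_args {
            unsigned int flags;
            unsigned long addr;
    };

    static int example_handler(const void __user *uarg)
    {
            struct example_args args;

            if (copy_from_user(&args, uarg, sizeof(args)))
                    return -EFAULT;         /* some bytes were left uncopied */
            /* ... use args.flags / args.addr ... */
            return 0;
    }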
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
new file mode 100644
index 000000000000..893d43f838cc
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,166 @@
1/*
2 * User address space access functions.
3 *
4 * Copyright 1997 Andi Kleen <ak@muc.de>
5 * Copyright 1997 Linus Torvalds
6 * Copyright 2002 Andi Kleen <ak@suse.de>
7 */
8#include <linux/module.h>
9#include <asm/uaccess.h>
10
11/*
12 * Copy a null terminated string from userspace.
13 */
14
15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \
17 long __d0, __d1, __d2; \
18 might_sleep(); \
19 __asm__ __volatile__( \
20 " testq %1,%1\n" \
21 " jz 2f\n" \
22 "0: lodsb\n" \
23 " stosb\n" \
24 " testb %%al,%%al\n" \
25 " jz 1f\n" \
26 " decq %1\n" \
27 " jnz 0b\n" \
28 "1: subq %1,%0\n" \
29 "2:\n" \
30 ".section .fixup,\"ax\"\n" \
31 "3: movq %5,%0\n" \
32 " jmp 2b\n" \
33 ".previous\n" \
34 ".section __ex_table,\"a\"\n" \
35 " .align 8\n" \
36 " .quad 0b,3b\n" \
37 ".previous" \
38 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
39 "=&D" (__d2) \
40 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
41 : "memory"); \
42} while (0)
43
44long
45__strncpy_from_user(char *dst, const char __user *src, long count)
46{
47 long res;
48 __do_strncpy_from_user(dst, src, count, res);
49 return res;
50}
51EXPORT_SYMBOL(__strncpy_from_user);
52
53long
54strncpy_from_user(char *dst, const char __user *src, long count)
55{
56 long res = -EFAULT;
57 if (access_ok(VERIFY_READ, src, 1))
58 return __strncpy_from_user(dst, src, count);
59 return res;
60}
61EXPORT_SYMBOL(strncpy_from_user);
62
63/*
64 * Zero Userspace
65 */
66
67unsigned long __clear_user(void __user *addr, unsigned long size)
68{
69 long __d0;
70 might_sleep();
71 /* no memory constraint because it doesn't change any memory gcc knows
72 about */
73 asm volatile(
74 " testq %[size8],%[size8]\n"
75 " jz 4f\n"
76 "0: movq %[zero],(%[dst])\n"
77 " addq %[eight],%[dst]\n"
78 " decl %%ecx ; jnz 0b\n"
79 "4: movq %[size1],%%rcx\n"
80 " testl %%ecx,%%ecx\n"
81 " jz 2f\n"
82 "1: movb %b[zero],(%[dst])\n"
83 " incq %[dst]\n"
84 " decl %%ecx ; jnz 1b\n"
85 "2:\n"
86 ".section .fixup,\"ax\"\n"
87 "3: lea 0(%[size1],%[size8],8),%[size8]\n"
88 " jmp 2b\n"
89 ".previous\n"
90 ".section __ex_table,\"a\"\n"
91 " .align 8\n"
92 " .quad 0b,3b\n"
93 " .quad 1b,2b\n"
94 ".previous"
95 : [size8] "=c"(size), [dst] "=&D" (__d0)
96 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
97 [zero] "r" (0UL), [eight] "r" (8UL));
98 return size;
99}
100EXPORT_SYMBOL(__clear_user);
101
102unsigned long clear_user(void __user *to, unsigned long n)
103{
104 if (access_ok(VERIFY_WRITE, to, n))
105 return __clear_user(to, n);
106 return n;
107}
108EXPORT_SYMBOL(clear_user);
109
110/*
111 * Return the size of a string (including the ending 0)
112 *
113 * Return 0 on exception, a value greater than N if too long
114 */
115
116long __strnlen_user(const char __user *s, long n)
117{
118 long res = 0;
119 char c;
120
121 while (1) {
122 if (res>n)
123 return n+1;
124 if (__get_user(c, s))
125 return 0;
126 if (!c)
127 return res+1;
128 res++;
129 s++;
130 }
131}
132EXPORT_SYMBOL(__strnlen_user);
133
134long strnlen_user(const char __user *s, long n)
135{
136 if (!access_ok(VERIFY_READ, s, n))
137 return 0;
138 return __strnlen_user(s, n);
139}
140EXPORT_SYMBOL(strnlen_user);
141
142long strlen_user(const char __user *s)
143{
144 long res = 0;
145 char c;
146
147 for (;;) {
148 if (get_user(c, s))
149 return 0;
150 if (!c)
151 return res+1;
152 res++;
153 s++;
154 }
155}
156EXPORT_SYMBOL(strlen_user);
157
158unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
159{
160 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
161 return copy_user_generic((__force void *)to, (__force void *)from, len);
162 }
163 return len;
164}
165EXPORT_SYMBOL(copy_in_user);
166
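copy_in_user() is the odd one out here: it moves data between two user buffers (useful in compat/32-bit emulation glue) and, like the other copy helpers in this file, reports the number of bytes left uncopied. A hedged usage sketch:

    static long example_user_to_user(void __user *dst,
                                     const void __user *src, unsigned len)
    {
            if (copy_in_user(dst, src, len))
                    return -EFAULT;         /* part of either range was inaccessible */
            return 0;
    }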
diff --git a/arch/x86/mach-default/Makefile b/arch/x86/mach-default/Makefile
new file mode 100644
index 000000000000..012fe34459e6
--- /dev/null
+++ b/arch/x86/mach-default/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
new file mode 100644
index 000000000000..7f635c7a2381
--- /dev/null
+++ b/arch/x86/mach-default/setup.c
@@ -0,0 +1,180 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <asm/acpi.h>
9#include <asm/arch_hooks.h>
10#include <asm/e820.h>
11#include <asm/setup.h>
12
13#ifdef CONFIG_HOTPLUG_CPU
14#define DEFAULT_SEND_IPI (1)
15#else
16#define DEFAULT_SEND_IPI (0)
17#endif
18
19int no_broadcast=DEFAULT_SEND_IPI;
20
21/**
22 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
23 *
24 * Description:
25 * Perform any necessary interrupt initialisation prior to setting up
26 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
27 * interrupts should be initialised here if the machine emulates a PC
28 * in any way.
29 **/
30void __init pre_intr_init_hook(void)
31{
32 init_ISA_irqs();
33}
34
35/*
36 * IRQ2 is the cascade interrupt to the second interrupt controller
37 */
38static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
39
40/**
41 * intr_init_hook - post gate setup interrupt initialisation
42 *
43 * Description:
44 * Fill in any interrupts that may have been left out by the general
45 * init_IRQ() routine. Interrupts having to do with the machine rather
46 * than the devices on the I/O bus (like APIC interrupts in Intel MP
47 * systems) are started here.
48 **/
49void __init intr_init_hook(void)
50{
51#ifdef CONFIG_X86_LOCAL_APIC
52 apic_intr_init();
53#endif
54
55 if (!acpi_ioapic)
56 setup_irq(2, &irq2);
57}
58
59/**
60 * pre_setup_arch_hook - hook called prior to any setup_arch() execution
61 *
62 * Description:
63 * Generally used to activate any machine-specific identification
64 * routines that may be needed before setup_arch() runs. On VISWS
65 * this is used to get the board revision and type.
66 **/
67void __init pre_setup_arch_hook(void)
68{
69}
70
71/**
72 * trap_init_hook - initialise system specific traps
73 *
74 * Description:
75 * Called as the final act of trap_init(). Used in VISWS to initialise
76 * the various board specific APIC traps.
77 **/
78void __init trap_init_hook(void)
79{
80}
81
82static struct irqaction irq0 = {
83 .handler = timer_interrupt,
84 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
85 .mask = CPU_MASK_NONE,
86 .name = "timer"
87};
88
89/**
90 * time_init_hook - do any specific initialisations for the system timer.
91 *
92 * Description:
93 * Must plug the system timer interrupt source at HZ into the IRQ listed
94 * in irq_vectors.h:TIMER_IRQ
95 **/
96void __init time_init_hook(void)
97{
98 irq0.mask = cpumask_of_cpu(0);
99 setup_irq(0, &irq0);
100}
101
102#ifdef CONFIG_MCA
103/**
104 * mca_nmi_hook - hook into MCA specific NMI chain
105 *
106 * Description:
107 * The MCA (Microchannel Architecture) has an NMI chain for NMI sources
108 * along the MCA bus. Use this to hook into that chain if you will need
109 * it.
110 **/
111void mca_nmi_hook(void)
112{
113 /* If I recall correctly, there's a whole bunch of other things that
114 * we can do to check for NMI problems, but that's all I know about
115 * at the moment.
116 */
117
118 printk("NMI generated from unknown source!\n");
119}
120#endif
121
122static __init int no_ipi_broadcast(char *str)
123{
124 get_option(&str, &no_broadcast);
125 printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
126 "IPI Broadcast");
127 return 1;
128}
129
130__setup("no_ipi_broadcast", no_ipi_broadcast);
131
132static int __init print_ipi_mode(void)
133{
134 printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
135 "Shortcut");
136 return 0;
137}
138
139late_initcall(print_ipi_mode);
140
141/**
142 * machine_specific_memory_setup - Hook for machine specific memory setup.
143 *
144 * Description:
145 * This is included late in kernel/setup.c so that it can make
146 * use of all of the static functions.
147 **/
148
149char * __init machine_specific_memory_setup(void)
150{
151 char *who;
152
153
154 who = "BIOS-e820";
155
156 /*
157 * Try to copy the BIOS-supplied E820-map.
158 *
159 * Otherwise fake a memory map; one section from 0k->640k,
160 * the next section from 1mb->appropriate_mem_k
161 */
162 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
163 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
164 unsigned long mem_size;
165
166 /* compare results from other methods and take the greater */
167 if (ALT_MEM_K < EXT_MEM_K) {
168 mem_size = EXT_MEM_K;
169 who = "BIOS-88";
170 } else {
171 mem_size = ALT_MEM_K;
172 who = "BIOS-e801";
173 }
174
175 e820.nr_map = 0;
176 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
177 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
178 }
179 return who;
180}
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
new file mode 100644
index 000000000000..69dd4da218dc
--- /dev/null
+++ b/arch/x86/mach-es7000/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
6obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
new file mode 100644
index 000000000000..c8d5aa132fa0
--- /dev/null
+++ b/arch/x86/mach-es7000/es7000.h
@@ -0,0 +1,114 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
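MIP_PORT() and MIP_RD_LO() just slice fixed fields out of the 64-bit words stored in struct mip_reg_info; parse_unisys_oem() (in es7000plat.c, later in this diff) uses them to recover the I/O port and the physical addresses of host_reg/mip_reg. A worked example with a made-up value:

    /*   v = 0x0000123400001000ULL
     *   MIP_PORT(v)  = (v >> 32) & 0xffff = 0x1234   -> port passed to outb()
     *   MIP_RD_LO(v) = v & 0xffffffff     = 0x1000   -> low 32 bits: a physical
     *                                                   address later mapped with __va() */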
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c
new file mode 100644
index 000000000000..ab99072d3f9a
--- /dev/null
+++ b/arch/x86/mach-es7000/es7000plat.c
@@ -0,0 +1,327 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h>
44
45/*
46 * ES7000 Globals
47 */
48
49volatile unsigned long *psai = NULL;
50struct mip_reg *mip_reg;
51struct mip_reg *host_reg;
52int mip_port;
53unsigned long mip_addr, host_addr;
54
55/*
56 * GSI override for ES7000 platforms.
57 */
58
59static unsigned int base;
60
61static int
62es7000_rename_gsi(int ioapic, int gsi)
63{
64 if (es7000_plat == ES7000_ZORRO)
65 return gsi;
66
67 if (!base) {
68 int i;
69 for (i = 0; i < nr_ioapics; i++)
70 base += nr_ioapic_registers[i];
71 }
72
73 if (!ioapic && (gsi < 16))
74 gsi += base;
75 return gsi;
76}
77
78void __init
79setup_unisys(void)
80{
81 /*
82 * Determine the generation of the ES7000 currently running.
83 *
84 * es7000_plat = 1 if the machine is a 5xx ES7000 box
85 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
86 *
87 */
88 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
89 es7000_plat = ES7000_ZORRO;
90 else
91 es7000_plat = ES7000_CLASSIC;
92 ioapic_renumber_irq = es7000_rename_gsi;
93}
94
95/*
96 * Parse the OEM Table
97 */
98
99int __init
100parse_unisys_oem (char *oemptr)
101{
102 int i;
103 int success = 0;
104 unsigned char type, size;
105 unsigned long val;
106 char *tp = NULL;
107 struct psai *psaip = NULL;
108 struct mip_reg_info *mi;
109 struct mip_reg *host, *mip;
110
111 tp = oemptr;
112
113 tp += 8;
114
115 for (i=0; i <= 6; i++) {
116 type = *tp++;
117 size = *tp++;
118 tp -= 2;
119 switch (type) {
120 case MIP_REG:
121 mi = (struct mip_reg_info *)tp;
122 val = MIP_RD_LO(mi->host_reg);
123 host_addr = val;
124 host = (struct mip_reg *)val;
125 host_reg = __va(host);
126 val = MIP_RD_LO(mi->mip_reg);
127 mip_port = MIP_PORT(mi->mip_info);
128 mip_addr = val;
129 mip = (struct mip_reg *)val;
130 mip_reg = __va(mip);
131 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n",
132 (unsigned long)host_reg);
133 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n",
134 (unsigned long)mip_reg);
135 success++;
136 break;
137 case MIP_PSAI_REG:
138 psaip = (struct psai *)tp;
139 if (tp != NULL) {
140 if (psaip->addr)
141 psai = __va(psaip->addr);
142 else
143 psai = NULL;
144 success++;
145 }
146 break;
147 default:
148 break;
149 }
150 tp += size;
151 }
152
153 if (success < 2) {
154 es7000_plat = NON_UNISYS;
155 } else
156 setup_unisys();
157 return es7000_plat;
158}
159
160#ifdef CONFIG_ACPI
161int __init
162find_unisys_acpi_oem_table(unsigned long *oem_addr)
163{
164 struct acpi_table_header *header = NULL;
165 int i = 0;
166 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
167 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
168 struct oem_table *t = (struct oem_table *)header;
169 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr,
170 t->OEMTableSize);
171 return 0;
172 }
173 }
174 return -1;
175}
176#endif
177
178/*
179 * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic
180 * arch already has got following function definitions (asm-generic/es7000.c)
181 * hence no need to define these for that case.
182 */
183#ifndef CONFIG_X86_GENERICARCH
184void es7000_sw_apic(void);
185void __init enable_apic_mode(void)
186{
187 es7000_sw_apic();
188 return;
189}
190
191__init int mps_oem_check(struct mp_config_table *mpc, char *oem,
192 char *productid)
193{
194 if (mpc->mpc_oemptr) {
195 struct mp_config_oemtable *oem_table =
196 (struct mp_config_oemtable *)mpc->mpc_oemptr;
197 if (!strncmp(oem, "UNISYS", 6))
198 return parse_unisys_oem((char *)oem_table);
199 }
200 return 0;
201}
202#ifdef CONFIG_ACPI
203/* Hook from generic ACPI tables.c */
204int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
205{
206 unsigned long oem_addr;
207 if (!find_unisys_acpi_oem_table(&oem_addr)) {
208 if (es7000_check_dsdt())
209 return parse_unisys_oem((char *)oem_addr);
210 else {
211 setup_unisys();
212 return 1;
213 }
214 }
215 return 0;
216}
217#else
218int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
219{
220 return 0;
221}
222#endif
223#endif /* CONFIG_X86_GENERICARCH */
224
225static void
226es7000_spin(int n)
227{
228 int i = 0;
229
230 while (i++ < n)
231 rep_nop();
232}
233
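/*
 * Sketch of the MIP mailbox handshake implemented below, as inferred from
 * the code (not from firmware documentation):
 *
 *   1. spin until the MIP_VALID bit in host_reg->off_38 clears
 *   2. copy the command block into host_reg and poke mip_port
 *   3. spin until MIP_VALID is set in mip_reg->off_38
 *   4. pull the status field out of mip_reg->off_0 and clear MIP_VALID
 */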
234static int __init
235es7000_mip_write(struct mip_reg *mip_reg)
236{
237 int status = 0;
238 int spin;
239
240 spin = MIP_SPIN;
241 while (((unsigned long long)host_reg->off_38 &
242 (unsigned long long)MIP_VALID) != 0) {
243 if (--spin <= 0) {
244 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
245 return -1;
246 }
247 es7000_spin(MIP_SPIN);
248 }
249
250 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
251 outb(1, mip_port);
252
253 spin = MIP_SPIN;
254
255 while (((unsigned long long)mip_reg->off_38 &
256 (unsigned long long)MIP_VALID) == 0) {
257 if (--spin <= 0) {
258 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
259 return -1;
260 }
261 es7000_spin(MIP_SPIN);
262 }
263
264 status = ((unsigned long long)mip_reg->off_0 &
265 (unsigned long long)0xffff0000000000ULL) >> 48;
266 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
267 (unsigned long long)~MIP_VALID);
268 return status;
269}
270
271int
272es7000_start_cpu(int cpu, unsigned long eip)
273{
274 unsigned long vect = 0, psaival = 0;
275
276 if (psai == NULL)
277 return -1;
278
279 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
280 psaival = (0x1000000 | vect | cpu);
281
282 while (*psai & 0x1000000)
283 ;
284
285 *psai = psaival;
286
287 return 0;
288
289}
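/*
 * Illustrative layout of the PSAI command word used above, inferred from
 * es7000_start_cpu()/es7000_stop_cpu() rather than firmware documentation:
 *
 *	bit  24      command pending/busy flag (0x1000000)
 *	bits 16-23   4K page number of the start EIP
 *	low bits     target cpu
 *
 * e.g. starting cpu 3 from a trampoline at physical 0x9000 writes
 * 0x1000000 | (9 << 16) | 3 == 0x1090003 into *psai.
 */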
290
291int
292es7000_stop_cpu(int cpu)
293{
294 int startup;
295
296 if (psai == NULL)
297 return -1;
298
299 startup= (0x1000000 | cpu);
300
301 while ((*psai & 0xff00ffff) != startup)
302 ;
303
304 startup = (*psai & 0xff0000) >> 16;
305 *psai &= 0xffffff;
306
307 return 0;
308
309}
310
311void __init
312es7000_sw_apic()
313{
314 if (es7000_plat) {
315 int mip_status;
316 struct mip_reg es7000_mip_reg;
317
318 printk("ES7000: Enabling APIC mode.\n");
319 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
320 es7000_mip_reg.off_0 = MIP_SW_APIC;
321 es7000_mip_reg.off_38 = (MIP_VALID);
322 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
323 printk("es7000_sw_apic: command failed, status = %x\n",
324 mip_status);
325 return;
326 }
327}
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
new file mode 100644
index 000000000000..19d6d407737b
--- /dev/null
+++ b/arch/x86/mach-generic/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the generic architecture
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6
7obj-y := probe.o summit.o bigsmp.o es7000.o default.o
8obj-y += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
new file mode 100644
index 000000000000..292a225edabe
--- /dev/null
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -0,0 +1,57 @@
1/*
2 * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
3 * Drives the local APIC in "clustered mode".
4 */
5#define APIC_DEFINITION 1
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h>
10#include <asm/genapic.h>
11#include <asm/fixmap.h>
12#include <asm/apicdef.h>
13#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h>
18#include <asm/mach-bigsmp/mach_apicdef.h>
19#include <asm/mach-bigsmp/mach_ipi.h>
20#include <asm/mach-default/mach_mpparse.h>
21
22static int dmi_bigsmp; /* can be set by dmi scanners */
23
24static int hp_ht_bigsmp(const struct dmi_system_id *d)
25{
26#ifdef CONFIG_X86_GENERICARCH
27 printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
28 dmi_bigsmp = 1;
29#endif
30 return 0;
31}
32
33
34static const struct dmi_system_id bigsmp_dmi_table[] = {
35 { hp_ht_bigsmp, "HP ProLiant DL760 G2", {
36 DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
37 DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
38 }},
39
40 { hp_ht_bigsmp, "HP ProLiant DL740", {
41 DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
42 DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
43 }},
44 { }
45};
46
47
48static int probe_bigsmp(void)
49{
50 if (def_to_bigsmp)
51 dmi_bigsmp = 1;
52 else
53 dmi_check_system(bigsmp_dmi_table);
54 return dmi_bigsmp;
55}
56
57struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
new file mode 100644
index 000000000000..8685208d8512
--- /dev/null
+++ b/arch/x86/mach-generic/default.c
@@ -0,0 +1,26 @@
1/*
2 * Default generic APIC driver. This handles up to 8 CPUs.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/mpspec.h>
8#include <asm/mach-default/mach_apicdef.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-default/mach_apic.h>
17#include <asm/mach-default/mach_ipi.h>
18#include <asm/mach-default/mach_mpparse.h>
19
20/* should be called last. */
21static int probe_default(void)
22{
23 return 1;
24}
25
26struct genapic apic_default = APIC_INIT("default", probe_default);
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
new file mode 100644
index 000000000000..4742626f08c4
--- /dev/null
+++ b/arch/x86/mach-generic/es7000.c
@@ -0,0 +1,69 @@
1/*
2 * APIC driver for the Unisys ES7000 chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h>
17#include <asm/mach-es7000/mach_apic.h>
18#include <asm/mach-es7000/mach_ipi.h>
19#include <asm/mach-es7000/mach_mpparse.h>
20#include <asm/mach-es7000/mach_wakecpu.h>
21
22static int probe_es7000(void)
23{
24 /* probed later in mptable/ACPI hooks */
25 return 0;
26}
27
28extern void es7000_sw_apic(void);
29static void __init enable_apic_mode(void)
30{
31 es7000_sw_apic();
32 return;
33}
34
35static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
36 char *productid)
37{
38 if (mpc->mpc_oemptr) {
39 struct mp_config_oemtable *oem_table =
40 (struct mp_config_oemtable *)mpc->mpc_oemptr;
41 if (!strncmp(oem, "UNISYS", 6))
42 return parse_unisys_oem((char *)oem_table);
43 }
44 return 0;
45}
46
47#ifdef CONFIG_ACPI
48/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{
51 unsigned long oem_addr;
52 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt())
54 return parse_unisys_oem((char *)oem_addr);
55 else {
56 setup_unisys();
57 return 1;
58 }
59 }
60 return 0;
61}
62#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
64{
65 return 0;
66}
67#endif
68
69struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
new file mode 100644
index 000000000000..74f3da634423
--- /dev/null
+++ b/arch/x86/mach-generic/probe.c
@@ -0,0 +1,125 @@
1/* Copyright 2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License, v.2
3 *
4 * Generic x86 APIC driver probe layer.
5 */
6#include <linux/threads.h>
7#include <linux/cpumask.h>
8#include <linux/string.h>
9#include <linux/kernel.h>
10#include <linux/ctype.h>
11#include <linux/init.h>
12#include <linux/errno.h>
13#include <asm/fixmap.h>
14#include <asm/mpspec.h>
15#include <asm/apicdef.h>
16#include <asm/genapic.h>
17
18extern struct genapic apic_summit;
19extern struct genapic apic_bigsmp;
20extern struct genapic apic_es7000;
21extern struct genapic apic_default;
22
23struct genapic *genapic = &apic_default;
24
25struct genapic *apic_probe[] __initdata = {
26 &apic_summit,
27 &apic_bigsmp,
28 &apic_es7000,
29 &apic_default, /* must be last */
30 NULL,
31};
32
33static int cmdline_apic __initdata;
34static int __init parse_apic(char *arg)
35{
36 int i;
37
38 if (!arg)
39 return -EINVAL;
40
41 for (i = 0; apic_probe[i]; i++) {
42 if (!strcmp(apic_probe[i]->name, arg)) {
43 genapic = apic_probe[i];
44 cmdline_apic = 1;
45 return 0;
46 }
47 }
48
49 /* Parsed again by __setup for debug/verbose */
50 return 0;
51}
52early_param("apic", parse_apic);
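/*
 * Usage note: the "apic=" boot parameter selects one of the drivers in
 * apic_probe[] by name, e.g. "apic=bigsmp" or "apic=es7000"; any other
 * value falls through to the normal probe order below.
 */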
53
54void __init generic_bigsmp_probe(void)
55{
56 /*
57 * This routine is used to switch to bigsmp mode when
58 * - There is no apic= option specified by the user
59 * - generic_apic_probe() has chosen apic_default as the sub_arch
60 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
61 */
62
63 if (!cmdline_apic && genapic == &apic_default)
64 if (apic_bigsmp.probe()) {
65 genapic = &apic_bigsmp;
66 printk(KERN_INFO "Overriding APIC driver with %s\n",
67 genapic->name);
68 }
69}
70
71void __init generic_apic_probe(void)
72{
73 if (!cmdline_apic) {
74 int i;
75 for (i = 0; apic_probe[i]; i++) {
76 if (apic_probe[i]->probe()) {
77 genapic = apic_probe[i];
78 break;
79 }
80 }
81 /* Not visible without early console */
82 if (!apic_probe[i])
83 panic("Didn't find an APIC driver");
84 }
85 printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
86}
87
88/* These functions can switch the APIC even after the initial ->probe() */
89
90int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid)
91{
92 int i;
93 for (i = 0; apic_probe[i]; ++i) {
94 if (apic_probe[i]->mps_oem_check(mpc,oem,productid)) {
95 if (!cmdline_apic) {
96 genapic = apic_probe[i];
97 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
98 genapic->name);
99 }
100 return 1;
101 }
102 }
103 return 0;
104}
105
106int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
107{
108 int i;
109 for (i = 0; apic_probe[i]; ++i) {
110 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
111 if (!cmdline_apic) {
112 genapic = apic_probe[i];
113 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
114 genapic->name);
115 }
116 return 1;
117 }
118 }
119 return 0;
120}
121
122int hard_smp_processor_id(void)
123{
124 return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
125}
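/*
 * Illustrative sketch only (not part of this commit): a further sub-arch
 * driver would follow the same pattern as summit/bigsmp/es7000 -- its own
 * mach-generic/<name>.c built with that sub-arch's mach_apic.h headers, a
 * probe routine, and an entry in apic_probe[] above, ahead of apic_default.
 * The "myarch" names here are hypothetical.
 */
#if 0
#define APIC_DEFINITION 1
#include <asm/mach-myarch/mach_apic.h>
#include <asm/genapic.h>

static int probe_myarch(void)
{
	return 0;	/* selected later via the MPS/ACPI OEM hooks */
}

struct genapic apic_myarch = APIC_INIT("myarch", probe_myarch);
#endif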
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
new file mode 100644
index 000000000000..74883ccb8f73
--- /dev/null
+++ b/arch/x86/mach-generic/summit.c
@@ -0,0 +1,27 @@
1/*
2 * APIC driver for the IBM "Summit" chipset.
3 */
4#define APIC_DEFINITION 1
5#include <linux/threads.h>
6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h>
9#include <asm/genapic.h>
10#include <asm/fixmap.h>
11#include <asm/apicdef.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h>
17#include <asm/mach-summit/mach_apicdef.h>
18#include <asm/mach-summit/mach_ipi.h>
19#include <asm/mach-summit/mach_mpparse.h>
20
21static int probe_summit(void)
22{
23 /* probed later in mptable/ACPI hooks */
24 return 0;
25}
26
27struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-visws/Makefile b/arch/x86/mach-visws/Makefile
new file mode 100644
index 000000000000..835fd96ad768
--- /dev/null
+++ b/arch/x86/mach-visws/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-y := setup.o traps.o reboot.o
6
7obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
8obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
new file mode 100644
index 000000000000..f3c74fab8b95
--- /dev/null
+++ b/arch/x86/mach-visws/mpparse.c
@@ -0,0 +1,101 @@
1
2#include <linux/init.h>
3#include <linux/smp.h>
4
5#include <asm/smp.h>
6#include <asm/io.h>
7
8#include "cobalt.h"
9#include "mach_apic.h"
10
11/* Have we found an MP table */
12int smp_found_config;
13
14/*
15 * Various Linux-internal data structures created from the
16 * MP-table.
17 */
18int apic_version [MAX_APICS];
19
20int pic_mode;
21unsigned long mp_lapic_addr;
22
23/* Processor that is doing the boot up */
24unsigned int boot_cpu_physical_apicid = -1U;
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28
29unsigned int __initdata maxcpus = NR_CPUS;
30
31/*
32 * The Visual Workstation is Intel MP compliant in the hardware
33 * sense, but it doesn't have a BIOS(-configuration table).
34 * No problem for Linux.
35 */
36
37static void __init MP_processor_info (struct mpc_config_processor *m)
38{
39 int ver, logical_apicid;
40 physid_mask_t apic_cpus;
41
42 if (!(m->mpc_cpuflag & CPU_ENABLED))
43 return;
44
45 logical_apicid = m->mpc_apicid;
46 printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n",
47 m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
48 m->mpc_apicid,
49 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
50 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
51 m->mpc_apicver);
52
53 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
54 boot_cpu_physical_apicid = m->mpc_apicid;
55
56 ver = m->mpc_apicver;
57 if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) {
58 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
59 m->mpc_apicid, MAX_APICS);
60 return;
61 }
62
63 apic_cpus = apicid_to_cpu_present(m->mpc_apicid);
64 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
65 /*
66 * Validate version
67 */
68 if (ver == 0x0) {
69 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
70 "fixing up to 0x10. (tell your hw vendor)\n",
71 m->mpc_apicid);
72 ver = 0x10;
73 }
74 apic_version[m->mpc_apicid] = ver;
75}
76
77void __init find_smp_config(void)
78{
79 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
80 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
81
82 if (ncpus > CO_CPU_MAX) {
83 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
84 ncpus, mp);
85
86 ncpus = CO_CPU_MAX;
87 }
88
89 if (ncpus > maxcpus)
90 ncpus = maxcpus;
91
92 smp_found_config = 1;
93 while (ncpus--)
94 MP_processor_info(mp++);
95
96 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
97}
98
99void __init get_smp_config (void)
100{
101}
diff --git a/arch/x86/mach-visws/reboot.c b/arch/x86/mach-visws/reboot.c
new file mode 100644
index 000000000000..99332abfad42
--- /dev/null
+++ b/arch/x86/mach-visws/reboot.c
@@ -0,0 +1,55 @@
1#include <linux/module.h>
2#include <linux/smp.h>
3#include <linux/delay.h>
4
5#include <asm/io.h>
6#include "piix4.h"
7
8void (*pm_power_off)(void);
9EXPORT_SYMBOL(pm_power_off);
10
11void machine_shutdown(void)
12{
13#ifdef CONFIG_SMP
14 smp_send_stop();
15#endif
16}
17
18void machine_emergency_restart(void)
19{
20 /*
21 * Visual Workstations restart after this
22 * register is poked on the PIIX4
23 */
24 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
25}
26
27void machine_restart(char * __unused)
28{
29 machine_shutdown();
30 machine_emergency_restart();
31}
32
33void machine_power_off(void)
34{
35 unsigned short pm_status;
36 extern unsigned int pci_bus0;
37
38 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
39 outw(pm_status, PMSTS_PORT);
40
41 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
42
43 mdelay(10);
44
45#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
46 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
47
48 outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8);
49 outl(PIIX_SPECIAL_STOP, 0xCFC);
50}
51
52void machine_halt(void)
53{
54}
55
diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c
new file mode 100644
index 000000000000..1f81f10e03a0
--- /dev/null
+++ b/arch/x86/mach-visws/setup.c
@@ -0,0 +1,183 @@
1/*
2 * Unmaintained SGI Visual Workstation support.
3 * Split out from setup.c by davej@suse.de
4 */
5
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/interrupt.h>
9#include <linux/module.h>
10
11#include <asm/fixmap.h>
12#include <asm/arch_hooks.h>
13#include <asm/io.h>
14#include <asm/e820.h>
15#include <asm/setup.h>
16#include "cobalt.h"
17#include "piix4.h"
18
19int no_broadcast;
20
21char visws_board_type = -1;
22char visws_board_rev = -1;
23
24void __init visws_get_board_type_and_rev(void)
25{
26 int raw;
27
28 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
29 >> PIIX_GPI_BD_SHIFT;
30 /*
31 * Get Board rev.
32 * First, we have to initialize the 307 part to allow us access
33 * to the GPIO registers. Let's map them at 0x0fc0 which is right
34 * after the PIIX4 PM section.
35 */
36 outb_p(SIO_DEV_SEL, SIO_INDEX);
37 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
38
39 outb_p(SIO_DEV_MSB, SIO_INDEX);
40 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
41
42 outb_p(SIO_DEV_LSB, SIO_INDEX);
43 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
44
45 outb_p(SIO_DEV_ENB, SIO_INDEX);
46 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
47
48 /*
49 * Now, we have to map the power management section to write
50 * a bit which enables access to the GPIO registers.
51 * What lunatic came up with this shit?
52 */
53 outb_p(SIO_DEV_SEL, SIO_INDEX);
54 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to PM regs. */
55
56 outb_p(SIO_DEV_MSB, SIO_INDEX);
57 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
58
59 outb_p(SIO_DEV_LSB, SIO_INDEX);
60 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
61
62 outb_p(SIO_DEV_ENB, SIO_INDEX);
63 outb_p(1, SIO_DATA); /* Enable PM registers. */
64
65 /*
66 * Now, write the PM register which enables the GPIO registers.
67 */
68 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
69 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
70
71 /*
72 * Now, initialize the GPIO registers.
73 * We want them all to be inputs which is the
74 * power on default, so let's leave them alone.
75 * So, let's just read the board rev!
76 */
77 raw = inb_p(SIO_GP_DATA1);
78 raw &= 0x7f; /* 7 bits of valid board revision ID. */
79
80 if (visws_board_type == VISWS_320) {
81 if (raw < 0x6) {
82 visws_board_rev = 4;
83 } else if (raw < 0xc) {
84 visws_board_rev = 5;
85 } else {
86 visws_board_rev = 6;
87 }
88 } else if (visws_board_type == VISWS_540) {
89 visws_board_rev = 2;
90 } else {
91 visws_board_rev = raw;
92 }
93
94 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
95 (visws_board_type == VISWS_320 ? "320" :
96 (visws_board_type == VISWS_540 ? "540" :
97 "unknown")), visws_board_rev);
98}
99
100void __init pre_intr_init_hook(void)
101{
102 init_VISWS_APIC_irqs();
103}
104
105void __init intr_init_hook(void)
106{
107#ifdef CONFIG_X86_LOCAL_APIC
108 apic_intr_init();
109#endif
110}
111
112void __init pre_setup_arch_hook()
113{
114 visws_get_board_type_and_rev();
115}
116
117static struct irqaction irq0 = {
118 .handler = timer_interrupt,
119 .flags = IRQF_DISABLED | IRQF_IRQPOLL,
120 .name = "timer",
121};
122
123void __init time_init_hook(void)
124{
125 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
126
127 /* Set the countdown value */
128 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
129
130 /* Start the timer */
131 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
132
133 /* Enable (unmask) the timer interrupt */
134 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
135
136 /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */
137 setup_irq(0, &irq0);
138}
139
140/* Hook for machine specific memory setup. */
141
142#define MB (1024 * 1024)
143
144unsigned long sgivwfb_mem_phys;
145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
148
149long long mem_size __initdata = 0;
150
151char * __init machine_specific_memory_setup(void)
152{
153 long long gfx_mem_size = 8 * MB;
154
155 mem_size = ALT_MEM_K;
156
157 if (!mem_size) {
158 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
159 mem_size = 128 * MB;
160 }
161
162 /*
163 * this hardcodes the graphics memory to 8 MB
164 * it really should be sized dynamically (or at least
165 * set as a boot param)
166 */
167 if (!sgivwfb_mem_size) {
168 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
169 sgivwfb_mem_size = 8 * MB;
170 }
171
172 /*
173 * Trim to nearest MB
174 */
175 sgivwfb_mem_size &= ~((1 << 20) - 1);
176 sgivwfb_mem_phys = mem_size - gfx_mem_size;
177
178 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
179 add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
180 add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
181
182 return "PROM";
183}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
new file mode 100644
index 000000000000..843b67acf43b
--- /dev/null
+++ b/arch/x86/mach-visws/traps.c
@@ -0,0 +1,68 @@
1/* VISWS traps */
2
3#include <linux/sched.h>
4#include <linux/kernel.h>
5#include <linux/init.h>
6#include <linux/pci.h>
7#include <linux/pci_ids.h>
8
9#include <asm/io.h>
10#include <asm/arch_hooks.h>
11#include <asm/apic.h>
12#include "cobalt.h"
13#include "lithium.h"
14
15
16#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
17#define BCD (LI_INTB | LI_INTC | LI_INTD)
18#define ALLDEVS (A01234 | BCD)
19
20static __init void lithium_init(void)
21{
22 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
23 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
24
25 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
26 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
27 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
28 panic("This machine is not SGI Visual Workstation 320/540");
29 }
30
31 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
32 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
33 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
34 panic("This machine is not SGI Visual Workstation 320/540");
35 }
36
37 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
38 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
39}
40
41static __init void cobalt_init(void)
42{
43 /*
44 * On normal SMP PC this is used only with SMP, but we have to
45 * use it and set it up here to start the Cobalt clock
46 */
47 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
48 setup_local_APIC();
49 printk(KERN_INFO "Local APIC Version %#lx, ID %#lx\n",
50 apic_read(APIC_LVR), apic_read(APIC_ID));
51
52 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
53 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
54 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
55 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
56
57 /* Enable Cobalt APIC being careful to NOT change the ID! */
58 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
59
60 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
61 co_apic_read(CO_APIC_ID));
62}
63
64void __init trap_init_hook(void)
65{
66 lithium_init();
67 cobalt_init();
68}
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
new file mode 100644
index 000000000000..710faf71a650
--- /dev/null
+++ b/arch/x86/mach-visws/visws_apic.c
@@ -0,0 +1,299 @@
1/*
2 * linux/arch/i386/mach-visws/visws_apic.c
3 *
4 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
5 *
6 * SGI Visual Workstation interrupt controller
7 *
8 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
9 * which serves as the main interrupt controller in the system. Non-legacy
10 * hardware in the system uses this controller directly. Legacy devices
11 * are connected to the PIIX4, which in turn has its 8259(s) connected to
12 * one of the Cobalt APIC entries.
13 *
14 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
15 *
16 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
17 */
18
19#include <linux/kernel_stat.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22
23#include <asm/io.h>
24#include <asm/apic.h>
25#include <asm/i8259.h>
26
27#include "cobalt.h"
28#include "irq_vectors.h"
29
30
31static DEFINE_SPINLOCK(cobalt_lock);
32
33/*
34 * Set the given Cobalt APIC Redirection Table entry to point
35 * to the given IDT vector/index.
36 */
37static inline void co_apic_set(int entry, int irq)
38{
39 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
40 co_apic_write(CO_APIC_HI(entry), 0);
41}
42
43/*
44 * Cobalt (IO)-APIC functions to handle PCI devices.
45 */
46static inline int co_apic_ide0_hack(void)
47{
48 extern char visws_board_type;
49 extern char visws_board_rev;
50
51 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
52 return 5;
53 return CO_APIC_IDE0;
54}
55
56static int is_co_apic(unsigned int irq)
57{
58 if (IS_CO_APIC(irq))
59 return CO_APIC(irq);
60
61 switch (irq) {
62 case 0: return CO_APIC_CPU;
63 case CO_IRQ_IDE0: return co_apic_ide0_hack();
64 case CO_IRQ_IDE1: return CO_APIC_IDE1;
65 default: return -1;
66 }
67}
68
69
70/*
71 * This is the SGI Cobalt (IO-)APIC:
72 */
73
74static void enable_cobalt_irq(unsigned int irq)
75{
76 co_apic_set(is_co_apic(irq), irq);
77}
78
79static void disable_cobalt_irq(unsigned int irq)
80{
81 int entry = is_co_apic(irq);
82
83 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
84 co_apic_read(CO_APIC_LO(entry));
85}
86
87/*
88 * "irq" really just serves to identify the device. Here is where we
89 * map this to the Cobalt APIC entry where it's physically wired.
90 * This is called via request_irq -> setup_irq -> irq_desc->startup()
91 */
92static unsigned int startup_cobalt_irq(unsigned int irq)
93{
94 unsigned long flags;
95
96 spin_lock_irqsave(&cobalt_lock, flags);
97 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
98 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
99 enable_cobalt_irq(irq);
100 spin_unlock_irqrestore(&cobalt_lock, flags);
101 return 0;
102}
103
104static void ack_cobalt_irq(unsigned int irq)
105{
106 unsigned long flags;
107
108 spin_lock_irqsave(&cobalt_lock, flags);
109 disable_cobalt_irq(irq);
110 apic_write(APIC_EOI, APIC_EIO_ACK);
111 spin_unlock_irqrestore(&cobalt_lock, flags);
112}
113
114static void end_cobalt_irq(unsigned int irq)
115{
116 unsigned long flags;
117
118 spin_lock_irqsave(&cobalt_lock, flags);
119 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
120 enable_cobalt_irq(irq);
121 spin_unlock_irqrestore(&cobalt_lock, flags);
122}
123
124static struct irq_chip cobalt_irq_type = {
125 .typename = "Cobalt-APIC",
126 .startup = startup_cobalt_irq,
127 .shutdown = disable_cobalt_irq,
128 .enable = enable_cobalt_irq,
129 .disable = disable_cobalt_irq,
130 .ack = ack_cobalt_irq,
131 .end = end_cobalt_irq,
132};
133
134
135/*
136 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
137 * -- not the manner expected by the code in i8259.c.
138 *
139 * there is a 'master' physical interrupt source that gets sent to
140 * the CPU. But in the chipset there are various 'virtual' interrupts
141 * waiting to be handled. We represent this to Linux through a 'master'
142 * interrupt controller type, and through a special virtual interrupt-
143 * controller. Device drivers only see the virtual interrupt sources.
144 */
145static unsigned int startup_piix4_master_irq(unsigned int irq)
146{
147 init_8259A(0);
148
149 return startup_cobalt_irq(irq);
150}
151
152static void end_piix4_master_irq(unsigned int irq)
153{
154 unsigned long flags;
155
156 spin_lock_irqsave(&cobalt_lock, flags);
157 enable_cobalt_irq(irq);
158 spin_unlock_irqrestore(&cobalt_lock, flags);
159}
160
161static struct irq_chip piix4_master_irq_type = {
162 .typename = "PIIX4-master",
163 .startup = startup_piix4_master_irq,
164 .ack = ack_cobalt_irq,
165 .end = end_piix4_master_irq,
166};
167
168
169static struct irq_chip piix4_virtual_irq_type = {
170 .typename = "PIIX4-virtual",
171 .shutdown = disable_8259A_irq,
172 .enable = enable_8259A_irq,
173 .disable = disable_8259A_irq,
174};
175
176
177/*
178 * PIIX4-8259 master/virtual functions to handle interrupt requests
179 * from legacy devices: floppy, parallel, serial, rtc.
180 *
181 * None of these get Cobalt APIC entries, neither do they have IDT
182 * entries. These interrupts are purely virtual and distributed from
183 * the 'master' interrupt source: CO_IRQ_8259.
184 *
185 * When the 8259 interrupts, its handler figures out which of these
186 * devices is interrupting and dispatches to its handler.
187 *
188 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
189 * enable_irq gets the right irq. This 'master' irq is never directly
190 * manipulated by any driver.
191 */
192static irqreturn_t piix4_master_intr(int irq, void *dev_id)
193{
194 int realirq;
195 irq_desc_t *desc;
196 unsigned long flags;
197
198 spin_lock_irqsave(&i8259A_lock, flags);
199
200 /* Find out what's interrupting in the PIIX4 master 8259 */
201 outb(0x0c, 0x20); /* OCW3 Poll command */
202 realirq = inb(0x20);
203
204 /*
205 * Bit 7 == 0 means invalid/spurious
206 */
207 if (unlikely(!(realirq & 0x80)))
208 goto out_unlock;
209
210 realirq &= 7;
211
212 if (unlikely(realirq == 2)) {
213 outb(0x0c, 0xa0);
214 realirq = inb(0xa0);
215
216 if (unlikely(!(realirq & 0x80)))
217 goto out_unlock;
218
219 realirq = (realirq & 7) + 8;
220 }
221
222 /* mask and ack interrupt */
223 cached_irq_mask |= 1 << realirq;
224 if (unlikely(realirq > 7)) {
225 inb(0xa1);
226 outb(cached_slave_mask, 0xa1);
227 outb(0x60 + (realirq & 7), 0xa0);
228 outb(0x60 + 2, 0x20);
229 } else {
230 inb(0x21);
231 outb(cached_master_mask, 0x21);
232 outb(0x60 + realirq, 0x20);
233 }
234
235 spin_unlock_irqrestore(&i8259A_lock, flags);
236
237 desc = irq_desc + realirq;
238
239 /*
240 * handle this 'virtual interrupt' as a Cobalt one now.
241 */
242 kstat_cpu(smp_processor_id()).irqs[realirq]++;
243
244 if (likely(desc->action != NULL))
245 handle_IRQ_event(realirq, desc->action);
246
247 if (!(desc->status & IRQ_DISABLED))
248 enable_8259A_irq(realirq);
249
250 return IRQ_HANDLED;
251
252out_unlock:
253 spin_unlock_irqrestore(&i8259A_lock, flags);
254 return IRQ_NONE;
255}
256
257static struct irqaction master_action = {
258 .handler = piix4_master_intr,
259 .name = "PIIX4-8259",
260};
261
262static struct irqaction cascade_action = {
263 .handler = no_action,
264 .name = "cascade",
265};
266
267
268void init_VISWS_APIC_irqs(void)
269{
270 int i;
271
272 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
273 irq_desc[i].status = IRQ_DISABLED;
274 irq_desc[i].action = 0;
275 irq_desc[i].depth = 1;
276
277 if (i == 0) {
278 irq_desc[i].chip = &cobalt_irq_type;
279 }
280 else if (i == CO_IRQ_IDE0) {
281 irq_desc[i].chip = &cobalt_irq_type;
282 }
283 else if (i == CO_IRQ_IDE1) {
284 irq_desc[i].chip = &cobalt_irq_type;
285 }
286 else if (i == CO_IRQ_8259) {
287 irq_desc[i].chip = &piix4_master_irq_type;
288 }
289 else if (i < CO_IRQ_APIC0) {
290 irq_desc[i].chip = &piix4_virtual_irq_type;
291 }
292 else if (IS_CO_APIC(i)) {
293 irq_desc[i].chip = &cobalt_irq_type;
294 }
295 }
296
297 setup_irq(CO_IRQ_8259, &master_action);
298 setup_irq(2, &cascade_action);
299}
diff --git a/arch/x86/mach-voyager/Makefile b/arch/x86/mach-voyager/Makefile
new file mode 100644
index 000000000000..15c250b371d3
--- /dev/null
+++ b/arch/x86/mach-voyager/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux kernel.
3#
4
5EXTRA_CFLAGS := -Iarch/x86/kernel
6obj-y := setup.o voyager_basic.o voyager_thread.o
7
8obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
new file mode 100644
index 000000000000..2b55694e6400
--- /dev/null
+++ b/arch/x86/mach-voyager/setup.c
@@ -0,0 +1,125 @@
1/*
2 * Machine specific setup for generic
3 */
4
5#include <linux/init.h>
6#include <linux/interrupt.h>
7#include <asm/arch_hooks.h>
8#include <asm/voyager.h>
9#include <asm/e820.h>
10#include <asm/io.h>
11#include <asm/setup.h>
12
13void __init pre_intr_init_hook(void)
14{
15 init_ISA_irqs();
16}
17
18/*
19 * IRQ2 is cascade interrupt to second interrupt controller
20 */
21static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
22
23void __init intr_init_hook(void)
24{
25#ifdef CONFIG_SMP
26 smp_intr_init();
27#endif
28
29 setup_irq(2, &irq2);
30}
31
32void __init pre_setup_arch_hook(void)
33{
34 /* Voyagers run their CPUs from independent clocks, so disable
35 * the TSC code because we can't sync them */
36 tsc_disable = 1;
37}
38
39void __init trap_init_hook(void)
40{
41}
42
43static struct irqaction irq0 = {
44 .handler = timer_interrupt,
45 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
46 .mask = CPU_MASK_NONE,
47 .name = "timer"
48};
49
50void __init time_init_hook(void)
51{
52 irq0.mask = cpumask_of_cpu(safe_smp_processor_id());
53 setup_irq(0, &irq0);
54}
55
56/* Hook for machine specific memory setup. */
57
58char * __init machine_specific_memory_setup(void)
59{
60 char *who;
61
62 who = "NOT VOYAGER";
63
64 if(voyager_level == 5) {
65 __u32 addr, length;
66 int i;
67
68 who = "Voyager-SUS";
69
70 e820.nr_map = 0;
71 for(i=0; voyager_memory_detect(i, &addr, &length); i++) {
72 add_memory_region(addr, length, E820_RAM);
73 }
74 return who;
75 } else if(voyager_level == 4) {
76 __u32 tom;
77 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
78 /* select the DINO config space */
79 outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
80 /* Read DINO top of memory register */
81 tom = ((inb(catbase + 0x4) & 0xf0) << 16)
82 + ((inb(catbase + 0x5) & 0x7f) << 24);
83
84 if(inb(catbase) != VOYAGER_DINO) {
85 printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
86 tom = (EXT_MEM_K)<<10;
87 }
88 who = "Voyager-TOM";
89 add_memory_region(0, 0x9f000, E820_RAM);
90 /* map from 1M to top of memory */
91 add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM);
92 /* FIXME: Should check the ASICs to see if I need to
93 * take out the 8M window. Just do it at the moment
94 * */
95 add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED);
96 return who;
97 }
98
99 who = "BIOS-e820";
100
101 /*
102 * Try to copy the BIOS-supplied E820-map.
103 *
104 * Otherwise fake a memory map; one section from 0k->640k,
105 * the next section from 1mb->appropriate_mem_k
106 */
107 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
108 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
109 unsigned long mem_size;
110
111 /* compare results from other methods and take the greater */
112 if (ALT_MEM_K < EXT_MEM_K) {
113 mem_size = EXT_MEM_K;
114 who = "BIOS-88";
115 } else {
116 mem_size = ALT_MEM_K;
117 who = "BIOS-e801";
118 }
119
120 e820.nr_map = 0;
121 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
122 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
123 }
124 return who;
125}
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
new file mode 100644
index 000000000000..9b77b39b71a6
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_basic.c
@@ -0,0 +1,331 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * linux/arch/i386/kernel/voyager.c
6 *
7 * This file contains all the voyager specific routines for getting
8 * initialisation of the architecture to function. For additional
9 * features see:
10 *
11 * voyager_cat.c - Voyager CAT bus interface
12 * voyager_smp.c - Voyager SMP hal (emulates linux smp.c)
13 */
14
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/sched.h>
18#include <linux/ptrace.h>
19#include <linux/ioport.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/delay.h>
23#include <linux/reboot.h>
24#include <linux/sysrq.h>
25#include <linux/smp.h>
26#include <linux/nodemask.h>
27#include <asm/io.h>
28#include <asm/voyager.h>
29#include <asm/vic.h>
30#include <linux/pm.h>
31#include <asm/tlbflush.h>
32#include <asm/arch_hooks.h>
33#include <asm/i8253.h>
34
35/*
36 * Power off function, if any
37 */
38void (*pm_power_off)(void);
39EXPORT_SYMBOL(pm_power_off);
40
41int voyager_level = 0;
42
43struct voyager_SUS *voyager_SUS = NULL;
44
45#ifdef CONFIG_SMP
46static void
47voyager_dump(int dummy1, struct tty_struct *dummy3)
48{
49 /* get here via a sysrq */
50 voyager_smp_dump();
51}
52
53static struct sysrq_key_op sysrq_voyager_dump_op = {
54 .handler = voyager_dump,
55 .help_msg = "Voyager",
56 .action_msg = "Dump Voyager Status",
57};
58#endif
59
60void
61voyager_detect(struct voyager_bios_info *bios)
62{
63 if(bios->len != 0xff) {
64 int class = (bios->class_1 << 8)
65 | (bios->class_2 & 0xff);
66
67 printk("Voyager System detected.\n"
68 " Class %x, Revision %d.%d\n",
69 class, bios->major, bios->minor);
70 if(class == VOYAGER_LEVEL4)
71 voyager_level = 4;
72 else if(class < VOYAGER_LEVEL5_AND_ABOVE)
73 voyager_level = 3;
74 else
75 voyager_level = 5;
76 printk(" Architecture Level %d\n", voyager_level);
77 if(voyager_level < 4)
78 printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
79 /* install the power off handler */
80 pm_power_off = voyager_power_off;
81#ifdef CONFIG_SMP
82 register_sysrq_key('v', &sysrq_voyager_dump_op);
83#endif
84 } else {
85 printk("\n\n**WARNING**: No Voyager Subsystem Found\n");
86 }
87}
88
89void
90voyager_system_interrupt(int cpl, void *dev_id)
91{
92 printk("Voyager: detected system interrupt\n");
93}
94
95/* Routine to read information from the extended CMOS area */
96__u8
97voyager_extended_cmos_read(__u16 addr)
98{
99 outb(addr & 0xff, 0x74);
100 outb((addr >> 8) & 0xff, 0x75);
101 return inb(0x76);
102}
103
104/* internal definitions for the SUS Click Map of memory */
105
106#define CLICK_ENTRIES 16
107#define CLICK_SIZE 4096 /* click to byte conversion for Length */
108
109typedef struct ClickMap {
110 struct Entry {
111 __u32 Address;
112 __u32 Length;
113 } Entry[CLICK_ENTRIES];
114} ClickMap_t;
115
116
117/* This routine is pretty much an awful hack to read the bios clickmap by
118 * mapping it into page 0. There are usually three regions in the map:
119 * Base Memory
120 * Extended Memory
121 * zero length marker for end of map
122 *
123 * Returns 0 on failure and 1 on success in extracting the region.
124 */
125int __init
126voyager_memory_detect(int region, __u32 *start, __u32 *length)
127{
128 int i;
129 int retval = 0;
130 __u8 cmos[4];
131 ClickMap_t *map;
132 unsigned long map_addr;
133 unsigned long old;
134
135 if(region >= CLICK_ENTRIES) {
136 printk("Voyager: Illegal ClickMap region %d\n", region);
137 return 0;
138 }
139
140 for(i = 0; i < sizeof(cmos); i++)
141 cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
142
143 map_addr = *(unsigned long *)cmos;
144
145 /* steal page 0 for this */
146 old = pg0[0];
147 pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
148 local_flush_tlb();
149 /* now clear everything out but page 0 */
150 map = (ClickMap_t *)(map_addr & (~PAGE_MASK));
151
152 /* zero length is the end of the clickmap */
153 if(map->Entry[region].Length != 0) {
154 *length = map->Entry[region].Length * CLICK_SIZE;
155 *start = map->Entry[region].Address;
156 retval = 1;
157 }
158
159 /* replace the mapping */
160 pg0[0] = old;
161 local_flush_tlb();
162 return retval;
163}
164
165/* voyager specific handling code for timer interrupts. Used to hand
166 * off the timer tick to the SMP code, since the VIC doesn't have an
167 * internal timer (The QIC does, but that's another story). */
168void
169voyager_timer_interrupt(void)
170{
171 if((jiffies & 0x3ff) == 0) {
172
173 /* There seems to be something flaky in either
174 * hardware or software that is resetting the timer 0
175 * count to something much higher than it should be
176 * This seems to occur in the boot sequence, just
177 * before root is mounted. Therefore, every 10
178 * seconds or so, we sanity check the timer zero count
179 * and kick it back to where it should be.
180 *
181 * FIXME: This is the most awful hack yet seen. I
182 * should work out exactly what is interfering with
183 * the timer count settings early in the boot sequence
184 * and swiftly introduce it to something sharp and
185 * pointy. */
186 __u16 val;
187
188 spin_lock(&i8253_lock);
189
190 outb_p(0x00, 0x43);
191 val = inb_p(0x40);
192 val |= inb(0x40) << 8;
193 spin_unlock(&i8253_lock);
194
195 if(val > LATCH) {
196 printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val);
197 spin_lock(&i8253_lock);
198 outb(0x34,0x43);
199 outb_p(LATCH & 0xff , 0x40); /* LSB */
200 outb(LATCH >> 8 , 0x40); /* MSB */
201 spin_unlock(&i8253_lock);
202 }
203 }
204#ifdef CONFIG_SMP
205 smp_vic_timer_interrupt();
206#endif
207}
208
209void
210voyager_power_off(void)
211{
212 printk("VOYAGER Power Off\n");
213
214 if(voyager_level == 5) {
215 voyager_cat_power_off();
216 } else if(voyager_level == 4) {
217 /* This doesn't apparently work on most L4 machines,
218 * but the specs say to do this to get automatic power
219 * off. Unfortunately, if it doesn't power off the
220 * machine, it ends up doing a cold restart, which
221 * isn't really intended, so comment out the code */
222#if 0
223 int port;
224
225
226 /* enable the voyager Configuration Space */
227 outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8,
228 VOYAGER_MC_SETUP);
229 /* the port for the power off flag is an offset from the
230 floating base */
231 port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
232 /* set the power off flag */
233 outb(inb(port) | 0x1, port);
234#endif
235 }
236 /* and wait for it to happen */
237 local_irq_disable();
238 for(;;)
239 halt();
240}
241
242/* copied from process.c */
243static inline void
244kb_wait(void)
245{
246 int i;
247
248 for (i=0; i<0x10000; i++)
249 if ((inb_p(0x64) & 0x02) == 0)
250 break;
251}
252
253void
254machine_shutdown(void)
255{
256 /* Architecture specific shutdown needed before a kexec */
257}
258
259void
260machine_restart(char *cmd)
261{
262 printk("Voyager Warm Restart\n");
263 kb_wait();
264
265 if(voyager_level == 5) {
266 /* write magic values to the RTC to inform system that
267 * shutdown is beginning */
268 outb(0x8f, 0x70);
269 outb(0x5 , 0x71);
270
271 udelay(50);
272 outb(0xfe,0x64); /* pull reset low */
273 } else if(voyager_level == 4) {
274 __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
275 __u8 basebd = inb(VOYAGER_MC_SETUP);
276
277 outb(basebd | 0x08, VOYAGER_MC_SETUP);
278 outb(0x02, catbase + 0x21);
279 }
280 local_irq_disable();
281 for(;;)
282 halt();
283}
284
285void
286machine_emergency_restart(void)
287{
288 /* for now, just hook this to a warm restart */
289 machine_restart(NULL);
290}
291
292void
293mca_nmi_hook(void)
294{
295 __u8 dumpval __maybe_unused = inb(0xf823);
296 __u8 swnmi __maybe_unused = inb(0xf813);
297
298 /* FIXME: assume dump switch pressed */
299 /* check to see if the dump switch was pressed */
300 VDEBUG(("VOYAGER: dumpval = 0x%x, swnmi = 0x%x\n", dumpval, swnmi));
301 /* clear swnmi */
302 outb(0xff, 0xf813);
303 /* tell SUS to ignore dump */
304 if(voyager_level == 5 && voyager_SUS != NULL) {
305 if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
306 voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
307 voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
308 udelay(1000);
309 voyager_SUS->kernel_mbox = VOYAGER_IGNORE_DUMP;
310 voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
311 }
312 }
313 printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id());
314 show_stack(NULL, NULL);
315 show_state();
316}
317
318
319
320void
321machine_halt(void)
322{
323 /* treat a halt like a power off */
324 machine_power_off();
325}
326
327void machine_power_off(void)
328{
329 if (pm_power_off)
330 pm_power_off();
331}
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
new file mode 100644
index 000000000000..26a2d4c54b68
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_cat.c
@@ -0,0 +1,1180 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_cat.c
8 *
9 * This file contains all the logic for manipulating the CAT bus
10 * in a level 5 machine.
11 *
12 * The CAT bus is a serial configuration and test bus. Its primary
13 * uses are to probe the initial configuration of the system and to
14 * diagnose error conditions when a system interrupt occurs. The low
15 * level interface is fairly primitive, so most of this file consists
16 * of bit shift manipulations to send and receive packets on the
17 * serial bus */
18
19#include <linux/types.h>
20#include <linux/completion.h>
21#include <linux/sched.h>
22#include <asm/voyager.h>
23#include <asm/vic.h>
24#include <linux/ioport.h>
25#include <linux/init.h>
26#include <linux/slab.h>
27#include <linux/delay.h>
28#include <asm/io.h>
29
30#ifdef VOYAGER_CAT_DEBUG
31#define CDEBUG(x) printk x
32#else
33#define CDEBUG(x)
34#endif
35
36/* the CAT command port */
37#define CAT_CMD (sspb + 0xe)
38/* the CAT data port */
39#define CAT_DATA (sspb + 0xd)
40
41/* the internal cat functions */
42static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data,
43 __u16 num_bits);
44static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data,
45 __u16 num_bits);
46static void cat_build_header(__u8 *header, const __u16 len,
47 const __u16 smallest_reg_bits,
48 const __u16 longest_reg_bits);
49static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp,
50 __u8 reg, __u8 op);
51static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp,
52 __u8 reg, __u8 *value);
53static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes,
54 __u8 pad_bits);
55static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
56 __u8 value);
57static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
58 __u8 *value);
59static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp,
60 __u16 offset, __u16 len, void *buf);
61static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
62 __u8 reg, __u8 value);
63static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp);
64static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp);
65
66static inline const char *
67cat_module_name(int module_id)
68{
69 switch(module_id) {
70 case 0x10:
71 return "Processor Slot 0";
72 case 0x11:
73 return "Processor Slot 1";
74 case 0x12:
75 return "Processor Slot 2";
76 case 0x13:
77 return "Processor Slot 4";
78 case 0x14:
79 return "Memory Slot 0";
80 case 0x15:
81 return "Memory Slot 1";
82 case 0x18:
83 return "Primary Microchannel";
84 case 0x19:
85 return "Secondary Microchannel";
86 case 0x1a:
87 return "Power Supply Interface";
88 case 0x1c:
89 return "Processor Slot 5";
90 case 0x1d:
91 return "Processor Slot 6";
92 case 0x1e:
93 return "Processor Slot 7";
94 case 0x1f:
95 return "Processor Slot 8";
96 default:
97 return "Unknown Module";
98 }
99}
100
101static int sspb = 0; /* stores the super port location */
102int voyager_8slot = 0; /* set to true if a 51xx monster */
103
104voyager_module_t *voyager_cat_list;
105
106/* the I/O port assignments for the VIC and QIC */
107static struct resource vic_res = {
108 .name = "Voyager Interrupt Controller",
109 .start = 0xFC00,
110 .end = 0xFC6F
111};
112static struct resource qic_res = {
113 .name = "Quad Interrupt Controller",
114 .start = 0xFC70,
115 .end = 0xFCFF
116};
117
118/* This function is used to pack a data bit stream inside a message.
119 * It writes num_bits of the data buffer in msg starting at start_bit.
120 * Note: This function assumes that any unused bit in the data stream
121 * is set to zero so that the ors will work correctly */
122static void
123cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
124{
125 /* compute initial shift needed */
126 const __u16 offset = start_bit % BITS_PER_BYTE;
127 __u16 len = num_bits / BITS_PER_BYTE;
128 __u16 byte = start_bit / BITS_PER_BYTE;
129 __u16 residue = (num_bits % BITS_PER_BYTE) + offset;
130 int i;
131
132 /* adjust if we have more than a byte of residue */
133 if(residue >= BITS_PER_BYTE) {
134 residue -= BITS_PER_BYTE;
135 len++;
136 }
137
138 /* clear out the bits. We assume here that if len==0 then
139 * residue >= offset. This is always true for the catbus
140 * operations */
141 msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
142 msg[byte++] |= data[0] >> offset;
143 if(len == 0)
144 return;
145 for(i = 1; i < len; i++)
146 msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset))
147 | (data[i] >> offset);
148 if(residue != 0) {
149 __u8 mask = 0xff >> residue;
150 __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset)
151 | (data[i] >> offset);
152
153 last_byte &= ~mask;
154 msg[byte] &= mask;
155 msg[byte] |= last_byte;
156 }
157 return;
158}
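/*
 * Worked example (sketch): cat_pack(msg, 10, data, 4) takes the four
 * most-significant bits of data[0] (the rest of data must be zero) and
 * lands them at bit positions 10..13 of msg, counting bit 0 as the MSB
 * of msg[0].
 */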
159/* unpack the data again (same arguments as cat_pack()). data buffer
160 * must be zero-filled.
161 *
162 * Function: given a message string move to start_bit and copy num_bits into
163 * data (starting at bit 0 in data).
164 */
165static void
166cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
167{
168 /* compute initial shift needed */
169 const __u16 offset = start_bit % BITS_PER_BYTE;
170 __u16 len = num_bits / BITS_PER_BYTE;
171 const __u8 last_bits = num_bits % BITS_PER_BYTE;
172 __u16 byte = start_bit / BITS_PER_BYTE;
173 int i;
174
175 if(last_bits != 0)
176 len++;
177
178 /* special case: want < 8 bits from msg and we can get it from
179 * a single byte of the msg */
180 if(len == 0 && BITS_PER_BYTE - offset >= num_bits) {
181 data[0] = msg[byte] << offset;
182 data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
183 return;
184 }
185 for(i = 0; i < len; i++) {
186 /* this annoying if has to be done just in case a read of
187 * msg one beyond the array causes a panic */
188 if(offset != 0) {
189 data[i] = msg[byte++] << offset;
190 data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
191 }
192 else {
193 data[i] = msg[byte++];
194 }
195 }
196 /* do we need to truncate the final byte */
197 if(last_bits != 0) {
198 data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits);
199 }
200 return;
201}
202
203static void
204cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits,
205 const __u16 longest_reg_bits)
206{
207 int i;
208 __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
209 __u8 *last_byte = &header[len - 1];
210
211 if(start_bit == 0)
212 start_bit = 1; /* must have at least one bit in the hdr */
213
214 for(i=0; i < len; i++)
215 header[i] = 0;
216
217 for(i = start_bit; i > 0; i--)
218 *last_byte = ((*last_byte) << 1) + 1;
219
220}
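/*
 * Worked example (sketch): cat_build_header(hdr, 2, 5, ...) zeroes both
 * header bytes and then sets (5 - 1) % 8 = 4 trailing '1' bits in the
 * last one, leaving hdr[] == { 0x00, 0x0f }.
 */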
221
222static int
223cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
224{
225 __u8 parity, inst, inst_buf[4] = { 0 };
226 __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
227 __u16 ibytes, hbytes, padbits;
228 int i;
229
230 /*
231 * Parity is the parity of the register number + 1 (READ_REGISTER
232 * and WRITE_REGISTER always add one to the count of '1' bits)
233 */
234 parity = (__u8)(1 + (reg & 0x01) +
235 ((__u8)(reg & 0x02) >> 1) +
236 ((__u8)(reg & 0x04) >> 2) +
237 ((__u8)(reg & 0x08) >> 3)) % 2;
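	/*
	 * e.g. reg 0x05 has two '1' bits, so parity = (1 + 2) % 2 = 1 and
	 * bit 7 of the instruction byte below will be set.
	 */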
238
239 inst = ((parity << 7) | (reg << 2) | op);
240
241 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
242 if(!modp->scan_path_connected) {
243 if(asicp->asic_id != VOYAGER_CAT_ID) {
244 printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
245 return 1;
246 }
247 outb(VOYAGER_CAT_HEADER, CAT_DATA);
248 outb(inst, CAT_DATA);
249 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
250 CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
251 return 1;
252 }
253 return 0;
254 }
255 ibytes = modp->inst_bits / BITS_PER_BYTE;
256 if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
257 padbits = BITS_PER_BYTE - padbits;
258 ibytes++;
259 }
260 hbytes = modp->largest_reg / BITS_PER_BYTE;
261 if(modp->largest_reg % BITS_PER_BYTE)
262 hbytes++;
263 CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
264 /* initialise the instruction sequence to 0xff */
265 for(i=0; i < ibytes + hbytes; i++)
266 iseq[i] = 0xff;
267 cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
268 cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
269 inst_buf[0] = inst;
270 inst_buf[1] = 0xFF >> (modp->largest_reg % BITS_PER_BYTE);
271 cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
272#ifdef VOYAGER_CAT_DEBUG
273 printk("ins = 0x%x, iseq: ", inst);
274 for(i=0; i< ibytes + hbytes; i++)
275 printk("0x%x ", iseq[i]);
276 printk("\n");
277#endif
278 if(cat_shiftout(iseq, ibytes, hbytes, padbits)) {
279 CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
280 return 1;
281 }
282 CDEBUG(("CAT SHIFTOUT DONE\n"));
283 return 0;
284}
285
286static int
287cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
288 __u8 *value)
289{
290 if(!modp->scan_path_connected) {
291 if(asicp->asic_id != VOYAGER_CAT_ID) {
292 CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
293 return 1;
294 }
295 if(reg > VOYAGER_SUBADDRHI)
296 outb(VOYAGER_CAT_RUN, CAT_CMD);
297 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
298 outb(VOYAGER_CAT_HEADER, CAT_DATA);
299 *value = inb(CAT_DATA);
300 outb(0xAA, CAT_DATA);
301 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
302 CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
303 return 1;
304 }
305 return 0;
306 }
307 else {
308 __u16 sbits = modp->num_asics -1 + asicp->ireg_length;
309 __u16 sbytes = sbits / BITS_PER_BYTE;
310 __u16 tbytes;
311 __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE];
312 __u8 padbits;
313 int i;
314
315 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
316
317 if((padbits = sbits % BITS_PER_BYTE) != 0) {
318 padbits = BITS_PER_BYTE - padbits;
319 sbytes++;
320 }
321 tbytes = asicp->ireg_length / BITS_PER_BYTE;
322 if(asicp->ireg_length % BITS_PER_BYTE)
323 tbytes++;
324 CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
325 tbytes, sbytes, padbits));
326 cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
327
328
329 for(i = tbytes - 1; i >= 0; i--) {
330 outb(trailer[i], CAT_DATA);
331 string[sbytes + i] = inb(CAT_DATA);
332 }
333
334 for(i = sbytes - 1; i >= 0; i--) {
335 outb(0xaa, CAT_DATA);
336 string[i] = inb(CAT_DATA);
337 }
338 *value = 0;
339 cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length);
340#ifdef VOYAGER_CAT_DEBUG
341 printk("value=0x%x, string: ", *value);
342 for(i=0; i< tbytes+sbytes; i++)
343 printk("0x%x ", string[i]);
344 printk("\n");
345#endif
346
347 /* sanity check the rest of the return */
348 for(i=0; i < tbytes; i++) {
349 __u8 input = 0;
350
351 cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE);
352 if(trailer[i] != input) {
353 CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
354 return 1;
355 }
356 }
357 CDEBUG(("cat_getdata DONE\n"));
358 return 0;
359 }
360}
361
362static int
363cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
364{
365 int i;
366
367 for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
368 outb(data[i], CAT_DATA);
369
370 for(i = header_bytes - 1; i >= 0; i--) {
371 __u8 header = 0;
372 __u8 input;
373
374 outb(data[i], CAT_DATA);
375 input = inb(CAT_DATA);
376 CDEBUG(("cat_shiftout: returned 0x%x\n", input));
377 cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
378 &header, BITS_PER_BYTE);
379 if(input != header) {
380 CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
381 return 1;
382 }
383 }
384 return 0;
385}
386
387static int
388cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
389 __u8 reg, __u8 value)
390{
391 outb(VOYAGER_CAT_DRCYC, CAT_CMD);
392 if(!modp->scan_path_connected) {
393 if(asicp->asic_id != VOYAGER_CAT_ID) {
394 CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
395 return 1;
396 }
397 outb(VOYAGER_CAT_HEADER, CAT_DATA);
398 outb(value, CAT_DATA);
399 if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
400 CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
401 return 1;
402 }
403 if(reg > VOYAGER_SUBADDRHI) {
404 outb(VOYAGER_CAT_RUN, CAT_CMD);
405 outb(VOYAGER_CAT_END, CAT_CMD);
406 outb(VOYAGER_CAT_RUN, CAT_CMD);
407 }
408
409 return 0;
410 }
411 else {
412 __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
413 __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE;
414 __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
415 hseq[VOYAGER_MAX_REG_SIZE];
416 int i;
417
418 if((padbits = (modp->num_asics - 1
419 + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
420 padbits = BITS_PER_BYTE - padbits;
421 dbytes++;
422 }
423 if(asicp->ireg_length % BITS_PER_BYTE)
424 hbytes++;
425
426 cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
427
428 for(i = 0; i < dbytes + hbytes; i++)
429 dseq[i] = 0xff;
430 CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
431 dbytes, hbytes, padbits));
432 cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
433 hseq, hbytes * BITS_PER_BYTE);
434 cat_pack(dseq, asicp->asic_location, &value,
435 asicp->ireg_length);
436#ifdef VOYAGER_CAT_DEBUG
437 printk("dseq ");
438 for(i=0; i<hbytes+dbytes; i++) {
439 printk("0x%x ", dseq[i]);
440 }
441 printk("\n");
442#endif
443 return cat_shiftout(dseq, dbytes, hbytes, padbits);
444 }
445}
446
447static int
448cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
449 __u8 value)
450{
451 if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
452 return 1;
453 return cat_senddata(modp, asicp, reg, value);
454}
455
456static int
457cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
458 __u8 *value)
459{
460 if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
461 return 1;
462 return cat_getdata(modp, asicp, reg, value);
463}
464
465static int
466cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
467 __u16 len)
468{
469 __u8 val;
470
471 if(len > 1) {
472 /* set auto increment */
473 __u8 newval;
474
475 if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
476 CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
477 return 1;
478 }
479 CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val));
480 newval = val | VOYAGER_AUTO_INC;
481 if(newval != val) {
482			if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, newval)) {
483 CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
484 return 1;
485 }
486 }
487 }
488	if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset & 0xff))) {
489 CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
490 return 1;
491 }
492 if(asicp->subaddr > VOYAGER_SUBADDR_LO) {
493 if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) {
494 CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
495 return 1;
496 }
497 cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
498 CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val));
499 }
500 cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
501 CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
502 return 0;
503}
504
505static int
506cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
507 __u16 len, void *buf)
508{
509 int i, retval;
510
511 /* FIXME: need special actions for VOYAGER_CAT_ID here */
512 if(asicp->asic_id == VOYAGER_CAT_ID) {
513 CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
514 /* FIXME -- This is supposed to be handled better
515 * There is a problem writing to the cat asic in the
516 * PSI. The 30us delay seems to work, though */
517 udelay(30);
518 }
519
520 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
521 printk("cat_subwrite: cat_subaddrsetup FAILED\n");
522 return retval;
523 }
524
525 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
526 printk("cat_subwrite: cat_sendinst FAILED\n");
527 return 1;
528 }
529 for(i = 0; i < len; i++) {
530 if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) {
531			printk("cat_subwrite: cat_senddata element at %d FAILED\n", i);
532 return 1;
533 }
534 }
535 return 0;
536}
537static int
538cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
539 __u16 len, void *buf)
540{
541 int i, retval;
542
543 if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
544 CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
545 return retval;
546 }
547
548 if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
549 CDEBUG(("cat_subread: cat_sendinst failed\n"));
550 return 1;
551 }
552 for(i = 0; i < len; i++) {
553 if(cat_getdata(modp, asicp, 0xFF,
554 &((__u8 *)buf)[i])) {
555 CDEBUG(("cat_subread: cat_getdata element %d failed\n", i));
556 return 1;
557 }
558 }
559 return 0;
560}
561
562
563/* buffer for storing EPROM data read in during initialisation */
564static __initdata __u8 eprom_buf[0xFFFF];
565static voyager_module_t *voyager_initial_module;
566
567/* Initialise the cat bus components. We assume this is called by the
568 * boot cpu *after* all memory initialisation has been done (so we can
569 * use kmalloc) but before smp initialisation, so we can probe the SMP
570 * configuration and pick up necessary information. */
571void
572voyager_cat_init(void)
573{
574 voyager_module_t **modpp = &voyager_initial_module;
575 voyager_asic_t **asicpp;
576 voyager_asic_t *qabc_asic = NULL;
577 int i, j;
578 unsigned long qic_addr = 0;
579 __u8 qabc_data[0x20];
580 __u8 num_submodules, val;
581 voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0];
582
583 __u8 cmos[4];
584 unsigned long addr;
585
586	/* initialise the SUS mailbox */
587 for(i=0; i<sizeof(cmos); i++)
588 cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
589 addr = *(unsigned long *)cmos;
590 if((addr & 0xff000000) != 0xff000000) {
591		printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx)\n", addr);
592 } else {
593 static struct resource res;
594
595 res.name = "voyager SUS";
596 res.start = addr;
597 res.end = addr+0x3ff;
598
599 request_resource(&iomem_resource, &res);
600 voyager_SUS = (struct voyager_SUS *)
601 ioremap(addr, 0x400);
602 printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
603 voyager_SUS->SUS_version);
604 voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
605 voyager_SUS->kernel_flags = VOYAGER_OS_HAS_SYSINT;
606 }
607
608 /* clear the processor counts */
609 voyager_extended_vic_processors = 0;
610 voyager_quad_processors = 0;
611
612
613
614 printk("VOYAGER: beginning CAT bus probe\n");
615 /* set up the SuperSet Port Block which tells us where the
616 * CAT communication port is */
617 sspb = inb(VOYAGER_SSPB_RELOCATION_PORT) * 0x100;
618 VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
619
620	/* now find out if we're 8 slot or normal */
621 if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
622 == EIGHT_SLOT_IDENTIFIER) {
623 voyager_8slot = 1;
624 printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n");
625 }
626
627 for(i = VOYAGER_MIN_MODULE;
628 i <= VOYAGER_MAX_MODULE; i++) {
629 __u8 input;
630 int asic;
631 __u16 eprom_size;
632 __u16 sp_offset;
633
634 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
635 outb(i, VOYAGER_CAT_CONFIG_PORT);
636
637 /* check the presence of the module */
638 outb(VOYAGER_CAT_RUN, CAT_CMD);
639 outb(VOYAGER_CAT_IRCYC, CAT_CMD);
640 outb(VOYAGER_CAT_HEADER, CAT_DATA);
641		/* stream a series of alternating 1s and 0s to stimulate
642		 * a response */
643 outb(0xAA, CAT_DATA);
644 input = inb(CAT_DATA);
645 outb(VOYAGER_CAT_END, CAT_CMD);
646 if(input != VOYAGER_CAT_HEADER) {
647 continue;
648 }
649 CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
650 cat_module_name(i)));
651 *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/
652 if(*modpp == NULL) {
653 printk("**WARNING** kmalloc failure in cat_init\n");
654 continue;
655 }
656 memset(*modpp, 0, sizeof(voyager_module_t));
657 /* need temporary asic for cat_subread. It will be
658 * filled in correctly later */
659 (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/
660 if((*modpp)->asic == NULL) {
661 printk("**WARNING** kmalloc failure in cat_init\n");
662 continue;
663 }
664 memset((*modpp)->asic, 0, sizeof(voyager_asic_t));
665 (*modpp)->asic->asic_id = VOYAGER_CAT_ID;
666 (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
667 (*modpp)->module_addr = i;
668 (*modpp)->scan_path_connected = 0;
669 if(i == VOYAGER_PSI) {
670 /* Exception leg for modules with no EEPROM */
671 printk("Module \"%s\"\n", cat_module_name(i));
672 continue;
673 }
674
675 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
676 outb(VOYAGER_CAT_RUN, CAT_CMD);
677 cat_disconnect(*modpp, (*modpp)->asic);
678 if(cat_subread(*modpp, (*modpp)->asic,
679 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
680 &eprom_size)) {
681 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
682 outb(VOYAGER_CAT_END, CAT_CMD);
683 continue;
684 }
685 if(eprom_size > sizeof(eprom_buf)) {
686 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size);
687 outb(VOYAGER_CAT_END, CAT_CMD);
688 continue;
689 }
690 outb(VOYAGER_CAT_END, CAT_CMD);
691 outb(VOYAGER_CAT_RUN, CAT_CMD);
692 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
693 if(cat_subread(*modpp, (*modpp)->asic, 0,
694 eprom_size, eprom_buf)) {
695 outb(VOYAGER_CAT_END, CAT_CMD);
696 continue;
697 }
698 outb(VOYAGER_CAT_END, CAT_CMD);
699 printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
700 cat_module_name(i), eprom_hdr->version_id,
701 *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics);
702 (*modpp)->ee_size = eprom_hdr->ee_size;
703 (*modpp)->num_asics = eprom_hdr->num_asics;
704 asicpp = &((*modpp)->asic);
705 sp_offset = eprom_hdr->scan_path_offset;
706 /* All we really care about are the Quad cards. We
707 * identify them because they are in a processor slot
708 * and have only four asics */
709 if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) {
710 modpp = &((*modpp)->next);
711 continue;
712 }
713 /* Now we know it's in a processor slot, does it have
714		 * a quad baseboard submodule? */
715 outb(VOYAGER_CAT_RUN, CAT_CMD);
716 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODPRESENT,
717 &num_submodules);
718 /* lowest two bits, active low */
719 num_submodules = ~(0xfc | num_submodules);
720 CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules));
721 if(num_submodules == 0) {
722 /* fill in the dyadic extended processors */
723 __u8 cpu = i & 0x07;
724
725 printk("Module \"%s\": Dyadic Processor Card\n",
726 cat_module_name(i));
727 voyager_extended_vic_processors |= (1<<cpu);
728 cpu += 4;
729 voyager_extended_vic_processors |= (1<<cpu);
730 outb(VOYAGER_CAT_END, CAT_CMD);
731 continue;
732 }
733
734 /* now we want to read the asics on the first submodule,
735 * which should be the quad base board */
736
737 cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, &val);
738 CDEBUG(("cat_init: SUBMODSELECT value = 0x%x\n", val));
739 val = (val & 0x7c) | VOYAGER_QUAD_BASEBOARD;
740 cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
741
742 outb(VOYAGER_CAT_END, CAT_CMD);
743
744
745 CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
746 outb(VOYAGER_CAT_RUN, CAT_CMD);
747 cat_disconnect(*modpp, (*modpp)->asic);
748 if(cat_subread(*modpp, (*modpp)->asic,
749 VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
750 &eprom_size)) {
751 printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
752 outb(VOYAGER_CAT_END, CAT_CMD);
753 continue;
754 }
755 if(eprom_size > sizeof(eprom_buf)) {
756 printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size);
757 outb(VOYAGER_CAT_END, CAT_CMD);
758 continue;
759 }
760 outb(VOYAGER_CAT_END, CAT_CMD);
761 outb(VOYAGER_CAT_RUN, CAT_CMD);
762 CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
763 if(cat_subread(*modpp, (*modpp)->asic, 0,
764 eprom_size, eprom_buf)) {
765 outb(VOYAGER_CAT_END, CAT_CMD);
766 continue;
767 }
768 outb(VOYAGER_CAT_END, CAT_CMD);
769 /* Now do everything for the QBB submodule 1 */
770 (*modpp)->ee_size = eprom_hdr->ee_size;
771 (*modpp)->num_asics = eprom_hdr->num_asics;
772 asicpp = &((*modpp)->asic);
773 sp_offset = eprom_hdr->scan_path_offset;
774 /* get rid of the dummy CAT asic and read the real one */
775 kfree((*modpp)->asic);
776 for(asic=0; asic < (*modpp)->num_asics; asic++) {
777 int j;
778 voyager_asic_t *asicp = *asicpp
779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table;
783
784 if(asicp == NULL) {
785 printk("**WARNING** kmalloc failure in cat_init\n");
786 continue;
787 }
788 asicpp = &(asicp->next);
789 asicp->asic_location = asic;
790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
791 asicp->asic_id = sp_table->asic_id;
792 asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset);
793 for(j=0; j<4; j++)
794 asicp->jtag_id[j] = asic_table->jtag_id[j];
795 jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset);
796 asicp->ireg_length = jtag_table->ireg_len;
797 asicp->bit_location = (*modpp)->inst_bits;
798 (*modpp)->inst_bits += asicp->ireg_length;
799 if(asicp->ireg_length > (*modpp)->largest_reg)
800 (*modpp)->largest_reg = asicp->ireg_length;
801 if (asicp->ireg_length < (*modpp)->smallest_reg ||
802 (*modpp)->smallest_reg == 0)
803 (*modpp)->smallest_reg = asicp->ireg_length;
804 CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
805 asicp->asic_id, asicp->ireg_length,
806 asicp->bit_location));
807 if(asicp->asic_id == VOYAGER_QUAD_QABC) {
808 CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
809 qabc_asic = asicp;
810 }
811 sp_offset += sizeof(voyager_sp_table_t);
812 }
813 CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n",
814 (*modpp)->inst_bits, (*modpp)->largest_reg,
815 (*modpp)->smallest_reg));
816		/* OK, now we have the QUAD ASICs set up, use them.
817		 * We need to:
818		 *
819		 * 1. Find the Memory area for the Quad CPIs.
820		 * 2. Find the Extended VIC processor.
821		 * 3. Configure a second extended VIC processor (this
822		 *    cannot be done for the 51xx).
823		 */
824 outb(VOYAGER_CAT_RUN, CAT_CMD);
825 cat_connect(*modpp, (*modpp)->asic);
826 CDEBUG(("CAT CONNECTED!!\n"));
827 cat_subread(*modpp, qabc_asic, 0, sizeof(qabc_data), qabc_data);
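		/* bytes 5-7 of the QABC data hold the top 24 bits of the
		 * Quad CPI area address, most significant byte first */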
828 qic_addr = qabc_data[5] << 8;
829 qic_addr = (qic_addr | qabc_data[6]) << 8;
830 qic_addr = (qic_addr | qabc_data[7]) << 8;
831 printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
832 cat_module_name(i), qic_addr, qabc_data[8]);
833#if 0 /* plumbing fails---FIXME */
834 if((qabc_data[8] & 0xf0) == 0) {
835 /* FIXME: 32 way 8 CPU slot monster cannot be
836 * plumbed this way---need to check for it */
837
838 printk("Plumbing second Extended Quad Processor\n");
839 /* second VIC line hardwired to Quad CPU 1 */
840 qabc_data[8] |= 0x20;
841 cat_subwrite(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
842#ifdef VOYAGER_CAT_DEBUG
843 /* verify plumbing */
844 cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
845 if((qabc_data[8] & 0xf0) == 0) {
846 CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8]));
847 }
848#endif
849 }
850#endif
851
852 {
853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
854 res->name = kmalloc(128, GFP_KERNEL);
855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
856 res->start = qic_addr;
857 res->end = qic_addr + 0x3ff;
858 request_resource(&iomem_resource, res);
859 }
860
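		/* map the Quad CPI area; qic_addr is reused to hold the
		 * kernel virtual address of the mapping */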
861 qic_addr = (unsigned long)ioremap(qic_addr, 0x400);
862
863 for(j = 0; j < 4; j++) {
864 __u8 cpu;
865
866 if(voyager_8slot) {
867 /* 8 slot has a different mapping,
868 * each slot has only one vic line, so
869 * 1 cpu in each slot must be < 8 */
870 cpu = (i & 0x07) + j*8;
871 } else {
872 cpu = (i & 0x03) + j*4;
873 }
874 if( (qabc_data[8] & (1<<j))) {
875 voyager_extended_vic_processors |= (1<<cpu);
876 }
877 if(qabc_data[8] & (1<<(j+4)) ) {
878 /* Second SET register plumbed: Quad
879 * card has two VIC connected CPUs.
880 * Secondary cannot be booted as a VIC
881 * CPU */
882 voyager_extended_vic_processors |= (1<<cpu);
883 voyager_allowed_boot_processors &= (~(1<<cpu));
884 }
885
886 voyager_quad_processors |= (1<<cpu);
887 voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
888 (qic_addr+(j<<8));
889 CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
890 (unsigned long)voyager_quad_cpi_addr[cpu]));
891 }
892 outb(VOYAGER_CAT_END, CAT_CMD);
893
894
895
896 *asicpp = NULL;
897 modpp = &((*modpp)->next);
898 }
899 *modpp = NULL;
900 printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors);
901 request_resource(&ioport_resource, &vic_res);
902 if(voyager_quad_processors)
903 request_resource(&ioport_resource, &qic_res);
904 /* set up the front power switch */
905}
906
907int
908voyager_cat_readb(__u8 module, __u8 asic, int reg)
909{
910 return 0;
911}
912
913static int
914cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
915{
916 __u8 val;
917 int err = 0;
918
919 if(!modp->scan_path_connected)
920 return 0;
921 if(asicp->asic_id != VOYAGER_CAT_ID) {
922 CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
923 return 1;
924 }
925 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
926 if(err) {
927 CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
928 return err;
929 }
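	/* mask out the connect bit so this asic drops off the scan path */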
930 val &= VOYAGER_DISCONNECT_ASIC;
931 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
932 if(err) {
933 CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
934 return err;
935 }
936 outb(VOYAGER_CAT_END, CAT_CMD);
937 outb(VOYAGER_CAT_RUN, CAT_CMD);
938 modp->scan_path_connected = 0;
939
940 return 0;
941}
942
943static int
944cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
945{
946 __u8 val;
947 int err = 0;
948
949 if(modp->scan_path_connected)
950 return 0;
951 if(asicp->asic_id != VOYAGER_CAT_ID) {
952 CDEBUG(("cat_connect: ASIC is not CAT\n"));
953 return 1;
954 }
955
956 err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
957 if(err) {
958 CDEBUG(("cat_connect: failed to read SCANPATH\n"));
959 return err;
960 }
961 val |= VOYAGER_CONNECT_ASIC;
962 err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
963 if(err) {
964 CDEBUG(("cat_connect: failed to write SCANPATH\n"));
965 return err;
966 }
967 outb(VOYAGER_CAT_END, CAT_CMD);
968 outb(VOYAGER_CAT_RUN, CAT_CMD);
969 modp->scan_path_connected = 1;
970
971 return 0;
972}
973
974void
975voyager_cat_power_off(void)
976{
977 /* Power the machine off by writing to the PSI over the CAT
978 * bus */
979 __u8 data;
980 voyager_module_t psi = { 0 };
981 voyager_asic_t psi_asic = { 0 };
982
983 psi.asic = &psi_asic;
984 psi.asic->asic_id = VOYAGER_CAT_ID;
985 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
986 psi.module_addr = VOYAGER_PSI;
987 psi.scan_path_connected = 0;
988
989 outb(VOYAGER_CAT_END, CAT_CMD);
990 /* Connect the PSI to the CAT Bus */
991 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
992 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
993 outb(VOYAGER_CAT_RUN, CAT_CMD);
994 cat_disconnect(&psi, &psi_asic);
995 /* Read the status */
996 cat_subread(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
997 outb(VOYAGER_CAT_END, CAT_CMD);
998 CDEBUG(("PSI STATUS 0x%x\n", data));
999 /* These two writes are power off prep and perform */
1000 data = PSI_CLEAR;
1001 outb(VOYAGER_CAT_RUN, CAT_CMD);
1002 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1003 outb(VOYAGER_CAT_END, CAT_CMD);
1004 data = PSI_POWER_DOWN;
1005 outb(VOYAGER_CAT_RUN, CAT_CMD);
1006 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data);
1007 outb(VOYAGER_CAT_END, CAT_CMD);
1008}
1009
1010struct voyager_status voyager_status = { 0 };
1011
1012void
1013voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
1014{
1015 voyager_module_t psi = { 0 };
1016 voyager_asic_t psi_asic = { 0 };
1017
1018 psi.asic = &psi_asic;
1019 psi.asic->asic_id = VOYAGER_CAT_ID;
1020 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1021 psi.module_addr = VOYAGER_PSI;
1022 psi.scan_path_connected = 0;
1023
1024 outb(VOYAGER_CAT_END, CAT_CMD);
1025 /* Connect the PSI to the CAT Bus */
1026 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1027 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1028 outb(VOYAGER_CAT_RUN, CAT_CMD);
1029 cat_disconnect(&psi, &psi_asic);
1030 switch(cmd) {
1031 case VOYAGER_PSI_READ:
1032 cat_read(&psi, &psi_asic, reg, data);
1033 break;
1034 case VOYAGER_PSI_WRITE:
1035 cat_write(&psi, &psi_asic, reg, *data);
1036 break;
1037 case VOYAGER_PSI_SUBREAD:
1038 cat_subread(&psi, &psi_asic, reg, 1, data);
1039 break;
1040 case VOYAGER_PSI_SUBWRITE:
1041 cat_subwrite(&psi, &psi_asic, reg, 1, data);
1042 break;
1043 default:
1044 printk(KERN_ERR "Voyager PSI, unrecognised command %d\n", cmd);
1045 break;
1046 }
1047 outb(VOYAGER_CAT_END, CAT_CMD);
1048}
1049
1050void
1051voyager_cat_do_common_interrupt(void)
1052{
1053 /* This is caused either by a memory parity error or something
1054 * in the PSI */
1055 __u8 data;
1056 voyager_module_t psi = { 0 };
1057 voyager_asic_t psi_asic = { 0 };
1058 struct voyager_psi psi_reg;
1059 int i;
1060 re_read:
1061 psi.asic = &psi_asic;
1062 psi.asic->asic_id = VOYAGER_CAT_ID;
1063 psi.asic->subaddr = VOYAGER_SUBADDR_HI;
1064 psi.module_addr = VOYAGER_PSI;
1065 psi.scan_path_connected = 0;
1066
1067 outb(VOYAGER_CAT_END, CAT_CMD);
1068 /* Connect the PSI to the CAT Bus */
1069 outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT);
1070 outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
1071 outb(VOYAGER_CAT_RUN, CAT_CMD);
1072 cat_disconnect(&psi, &psi_asic);
1073 /* Read the status. NOTE: Need to read *all* the PSI regs here
1074 * otherwise the cmn int will be reasserted */
1075 for(i = 0; i < sizeof(psi_reg.regs); i++) {
1076 cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]);
1077 }
1078 outb(VOYAGER_CAT_END, CAT_CMD);
1079 if((psi_reg.regs.checkbit & 0x02) == 0) {
1080 psi_reg.regs.checkbit |= 0x02;
1081 cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
1082 printk("VOYAGER RE-READ PSI\n");
1083 goto re_read;
1084 }
1085 outb(VOYAGER_CAT_RUN, CAT_CMD);
1086 for(i = 0; i < sizeof(psi_reg.subregs); i++) {
1087 /* This looks strange, but the PSI doesn't do auto increment
1088 * correctly */
1089 cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
1090 1, &((__u8 *)&psi_reg.subregs)[i]);
1091 }
1092 outb(VOYAGER_CAT_END, CAT_CMD);
1093#ifdef VOYAGER_CAT_DEBUG
1094 printk("VOYAGER PSI: ");
1095 for(i=0; i<sizeof(psi_reg.regs); i++)
1096 printk("%02x ", ((__u8 *)&psi_reg.regs)[i]);
1097 printk("\n ");
1098 for(i=0; i<sizeof(psi_reg.subregs); i++)
1099 printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]);
1100 printk("\n");
1101#endif
1102 if(psi_reg.regs.intstatus & PSI_MON) {
1103 /* switch off or power fail */
1104
1105 if(psi_reg.subregs.supply & PSI_SWITCH_OFF) {
1106 if(voyager_status.switch_off) {
1107 printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n");
1108 voyager_cat_power_off();
1109 /* not reached */
1110 } else {
1111 printk(KERN_ERR "Voyager front panel switch turned off\n");
1112 voyager_status.switch_off = 1;
1113 voyager_status.request_from_kernel = 1;
1114 wake_up_process(voyager_thread);
1115 }
1116			/* Tell the hardware we're taking care of the
1117			 * shutdown, otherwise it will power the box off
1118			 * within 3 seconds of the switch being pressed
1119			 * and, more importantly for us, continue to
1120			 * assert the common interrupt */
1121 data = PSI_CLR_SWITCH_OFF;
1122 outb(VOYAGER_CAT_RUN, CAT_CMD);
1123 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG,
1124 1, &data);
1125 outb(VOYAGER_CAT_END, CAT_CMD);
1126 } else {
1127
1128 VDEBUG(("Voyager ac fail reg 0x%x\n",
1129 psi_reg.subregs.ACfail));
1130 if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
1131 /* No further update */
1132 return;
1133 }
1134#if 0
1135 /* Don't bother trying to find out who failed.
1136 * FIXME: This probably makes the code incorrect on
1137 * anything other than a 345x */
1138 for(i=0; i< 5; i++) {
1139 if( psi_reg.subregs.ACfail &(1<<i)) {
1140 break;
1141 }
1142 }
1143 printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
1144#endif
1145 /* DON'T do this: it shuts down the AC PSI
1146 outb(VOYAGER_CAT_RUN, CAT_CMD);
1147 data = PSI_MASK_MASK | i;
1148 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
1149 1, &data);
1150 outb(VOYAGER_CAT_END, CAT_CMD);
1151 */
1152 printk(KERN_ERR "Voyager AC power failure\n");
1153 outb(VOYAGER_CAT_RUN, CAT_CMD);
1154 data = PSI_COLD_START;
1155 cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG,
1156 1, &data);
1157 outb(VOYAGER_CAT_END, CAT_CMD);
1158 voyager_status.power_fail = 1;
1159 voyager_status.request_from_kernel = 1;
1160 wake_up_process(voyager_thread);
1161 }
1162
1163
1164 } else if(psi_reg.regs.intstatus & PSI_FAULT) {
1165 /* Major fault! */
1166 printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n");
1167 voyager_cat_power_off();
1168 /* not reached */
1169 } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
1170 | PSI_CURRENT | PSI_DVM
1171 | PSI_PSCFAULT | PSI_STAT_CHG)) {
1172 /* other psi fault */
1173
1174		printk(KERN_WARNING "Voyager PSI status 0x%x\n", psi_reg.regs.intstatus);
1175 /* clear the PSI fault */
1176 outb(VOYAGER_CAT_RUN, CAT_CMD);
1177 cat_write(&psi, &psi_asic, VOYAGER_PSI_STATUS_REG, 0);
1178 outb(VOYAGER_CAT_END, CAT_CMD);
1179 }
1180}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
new file mode 100644
index 000000000000..b87f8548e75a
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -0,0 +1,1952 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 1999,2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_smp.c
8 *
9 * This file provides all the same external entries as smp.c but uses
10 * the voyager hal to provide the functionality
11 */
12#include <linux/module.h>
13#include <linux/mm.h>
14#include <linux/kernel_stat.h>
15#include <linux/delay.h>
16#include <linux/mc146818rtc.h>
17#include <linux/cache.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/bootmem.h>
22#include <linux/completion.h>
23#include <asm/desc.h>
24#include <asm/voyager.h>
25#include <asm/vic.h>
26#include <asm/mtrr.h>
27#include <asm/pgalloc.h>
28#include <asm/tlbflush.h>
29#include <asm/arch_hooks.h>
30
31/* TLB state -- visible externally, indexed physically */
32DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
33
34/* CPU IRQ affinity -- set to all ones initially */
35static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL };
36
37/* per CPU data structure (for /proc/cpuinfo et al), visible externally
38 * indexed physically */
39struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
40EXPORT_SYMBOL(cpu_data);
41
42/* physical ID of the CPU used to boot the system */
43unsigned char boot_cpu_id;
44
45/* The memory line addresses for the Quad CPIs */
46struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS] __cacheline_aligned;
47
48/* The masks for the Extended VIC processors, filled in by cat_init */
49__u32 voyager_extended_vic_processors = 0;
50
51/* Masks for the extended Quad processors which cannot be VIC booted */
52__u32 voyager_allowed_boot_processors = 0;
53
54/* The mask for the Quad Processors (both extended and non-extended) */
55__u32 voyager_quad_processors = 0;
56
57/* Total count of live CPUs, used in process.c to display
58 * the CPU information and in irq.c for the per CPU irq
59 * activity count. Finally exported by i386_ksyms.c */
60static int voyager_extended_cpus = 1;
61
62/* Have we found an SMP box - used by time.c to do the profiling
63 interrupt for timeslicing; do not set to 1 until the per CPU timer
64 interrupt is active */
65int smp_found_config = 0;
66
67/* Used for the invalidate map that's also checked in the spinlock */
68static volatile unsigned long smp_invalidate_needed;
69
70/* Bitmask of currently online CPUs - used by setup.c for
71 /proc/cpuinfo, visible externally but still physical */
72cpumask_t cpu_online_map = CPU_MASK_NONE;
73EXPORT_SYMBOL(cpu_online_map);
74
75/* Bitmask of CPUs present in the system - exported by i386_ksyms.c, used
76 * by scheduler but indexed physically */
77cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
78
79
80/* The internal functions */
81static void send_CPI(__u32 cpuset, __u8 cpi);
82static void ack_CPI(__u8 cpi);
83static int ack_QIC_CPI(__u8 cpi);
84static void ack_special_QIC_CPI(__u8 cpi);
85static void ack_VIC_CPI(__u8 cpi);
86static void send_CPI_allbutself(__u8 cpi);
87static void mask_vic_irq(unsigned int irq);
88static void unmask_vic_irq(unsigned int irq);
89static unsigned int startup_vic_irq(unsigned int irq);
90static void enable_local_vic_irq(unsigned int irq);
91static void disable_local_vic_irq(unsigned int irq);
92static void before_handle_vic_irq(unsigned int irq);
93static void after_handle_vic_irq(unsigned int irq);
94static void set_vic_irq_affinity(unsigned int irq, cpumask_t mask);
95static void ack_vic_irq(unsigned int irq);
96static void vic_enable_cpi(void);
97static void do_boot_cpu(__u8 cpuid);
98static void do_quad_bootstrap(void);
99
100int hard_smp_processor_id(void);
101int safe_smp_processor_id(void);
102
103/* Inline functions */
104static inline void
105send_one_QIC_CPI(__u8 cpu, __u8 cpi)
106{
107 voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
108 (smp_processor_id() << 16) + cpi;
109}
110
111static inline void
112send_QIC_CPI(__u32 cpuset, __u8 cpi)
113{
114 int cpu;
115
116 for_each_online_cpu(cpu) {
117 if(cpuset & (1<<cpu)) {
118#ifdef VOYAGER_DEBUG
119 if(!cpu_isset(cpu, cpu_online_map))
120 VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu));
121#endif
122 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
123 }
124 }
125}
126
127static inline void
128wrapper_smp_local_timer_interrupt(void)
129{
130 irq_enter();
131 smp_local_timer_interrupt();
132 irq_exit();
133}
134
135static inline void
136send_one_CPI(__u8 cpu, __u8 cpi)
137{
138 if(voyager_quad_processors & (1<<cpu))
139 send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
140 else
141 send_CPI(1<<cpu, cpi);
142}
143
144static inline void
145send_CPI_allbutself(__u8 cpi)
146{
147 __u8 cpu = smp_processor_id();
148 __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
149 send_CPI(mask, cpi);
150}
151
152static inline int
153is_cpu_quad(void)
154{
155 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
156 return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
157}
158
159static inline int
160is_cpu_extended(void)
161{
162 __u8 cpu = hard_smp_processor_id();
163
164 return(voyager_extended_vic_processors & (1<<cpu));
165}
166
167static inline int
168is_cpu_vic_boot(void)
169{
170 __u8 cpu = hard_smp_processor_id();
171
172 return(voyager_extended_vic_processors
173 & voyager_allowed_boot_processors & (1<<cpu));
174}
175
176
177static inline void
178ack_CPI(__u8 cpi)
179{
180 switch(cpi) {
181 case VIC_CPU_BOOT_CPI:
182 if(is_cpu_quad() && !is_cpu_vic_boot())
183 ack_QIC_CPI(cpi);
184 else
185 ack_VIC_CPI(cpi);
186 break;
187 case VIC_SYS_INT:
188 case VIC_CMN_INT:
189 /* These are slightly strange. Even on the Quad card,
190		 * they are vectored as VIC CPIs */
191 if(is_cpu_quad())
192 ack_special_QIC_CPI(cpi);
193 else
194 ack_VIC_CPI(cpi);
195 break;
196 default:
197 printk("VOYAGER ERROR: CPI%d is in common CPI code\n", cpi);
198 break;
199 }
200}
201
202/* local variables */
203
204/* The VIC IRQ descriptors -- these look almost identical to the
205 * 8259 IRQs except that masks and things must be kept per processor
206 */
207static struct irq_chip vic_chip = {
208 .name = "VIC",
209 .startup = startup_vic_irq,
210 .mask = mask_vic_irq,
211 .unmask = unmask_vic_irq,
212 .set_affinity = set_vic_irq_affinity,
213};
214
215/* used to count up as CPUs are brought on line (starts at 0) */
216static int cpucount = 0;
217
218/* steal a page from the bottom of memory for the trampoline and
219 * squirrel its address away here. This will be in kernel virtual
220 * space */
221static __u32 trampoline_base;
222
223/* The per cpu profile stuff - used in smp_local_timer_interrupt */
224static DEFINE_PER_CPU(int, prof_multiplier) = 1;
225static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
226static DEFINE_PER_CPU(int, prof_counter) = 1;
227
228/* the map used to check if a CPU has booted */
229static __u32 cpu_booted_map;
230
231/* the synchronize flag used to hold all secondary CPUs spinning in
232 * a tight loop until the boot sequence is ready for them */
233static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
234
235/* This is for the new dynamic CPU boot code */
236cpumask_t cpu_callin_map = CPU_MASK_NONE;
237cpumask_t cpu_callout_map = CPU_MASK_NONE;
238EXPORT_SYMBOL(cpu_callout_map);
239cpumask_t cpu_possible_map = CPU_MASK_NONE;
240EXPORT_SYMBOL(cpu_possible_map);
241
242/* The per processor IRQ masks (these are usually kept in sync) */
243static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
244
245/* the list of IRQs to be enabled by the VIC_ENABLE_IRQ_CPI */
246static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
247
248/* Lock for enable/disable of VIC interrupts */
249static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
250
251/* The boot processor is correctly set up in PC mode when it
252 * comes up, but the secondaries need their master/slave 8259
253 * pairs initializing correctly */
254
255/* Interrupt counters (per cpu) and total - used to try to
256 * even up the interrupt handling routines */
257static long vic_intr_total = 0;
258static long vic_intr_count[NR_CPUS] __cacheline_aligned = { 0 };
259static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
260
261/* Since we can only use CPI0, we fake all the other CPIs */
262static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
263
264/* debugging routine to read the isr of the cpu's pic */
265static inline __u16
266vic_read_isr(void)
267{
268 __u16 isr;
269
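	/* OCW3 0x0b selects the in-service register; read it from the
	 * slave (0xa0) and then the master (0x20) 8259 */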
270 outb(0x0b, 0xa0);
271 isr = inb(0xa0) << 8;
272 outb(0x0b, 0x20);
273 isr |= inb(0x20);
274
275 return isr;
276}
277
278static __init void
279qic_setup(void)
280{
281 if(!is_cpu_quad()) {
282 /* not a quad, no setup */
283 return;
284 }
285 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
286 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
287
288 if(is_cpu_extended()) {
289 /* the QIC duplicate of the VIC base register */
290 outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
291 outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
292
293 /* FIXME: should set up the QIC timer and memory parity
294 * error vectors here */
295 }
296}
297
298static __init void
299vic_setup_pic(void)
300{
301 outb(1, VIC_REDIRECT_REGISTER_1);
302 /* clear the claim registers for dynamic routing */
303 outb(0, VIC_CLAIM_REGISTER_0);
304 outb(0, VIC_CLAIM_REGISTER_1);
305
306 outb(0, VIC_PRIORITY_REGISTER);
307 /* Set the Primary and Secondary Microchannel vector
308 * bases to be the same as the ordinary interrupts
309 *
310 * FIXME: This would be more efficient using separate
311 * vectors. */
312 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
313 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
314	/* Now initialise the master PIC belonging to this CPU by
315 * sending the four ICWs */
316
317 /* ICW1: level triggered, ICW4 needed */
318 outb(0x19, 0x20);
319
320 /* ICW2: vector base */
321 outb(FIRST_EXTERNAL_VECTOR, 0x21);
322
323 /* ICW3: slave at line 2 */
324 outb(0x04, 0x21);
325
326 /* ICW4: 8086 mode */
327 outb(0x01, 0x21);
328
329 /* now the same for the slave PIC */
330
331 /* ICW1: level trigger, ICW4 needed */
332 outb(0x19, 0xA0);
333
334 /* ICW2: slave vector base */
335 outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
336
337 /* ICW3: slave ID */
338 outb(0x02, 0xA1);
339
340 /* ICW4: 8086 mode */
341 outb(0x01, 0xA1);
342}
343
344static void
345do_quad_bootstrap(void)
346{
347 if(is_cpu_quad() && is_cpu_vic_boot()) {
348 int i;
349 unsigned long flags;
350 __u8 cpuid = hard_smp_processor_id();
351
352 local_irq_save(flags);
353
354 for(i = 0; i<4; i++) {
355 /* FIXME: this would be >>3 &0x7 on the 32 way */
356 if(((cpuid >> 2) & 0x03) == i)
357 /* don't lower our own mask! */
358 continue;
359
360 /* masquerade as local Quad CPU */
361 outb(QIC_CPUID_ENABLE | i, QIC_PROCESSOR_ID);
362 /* enable the startup CPI */
363 outb(QIC_BOOT_CPI_MASK, QIC_MASK_REGISTER1);
364 /* restore cpu id */
365 outb(0, QIC_PROCESSOR_ID);
366 }
367 local_irq_restore(flags);
368 }
369}
370
371
372/* Set up all the basic stuff: read the SMP config and make all the
373 * SMP information reflect only the boot cpu. All others will be
374 * brought on-line later. */
375void __init
376find_smp_config(void)
377{
378 int i;
379
380 boot_cpu_id = hard_smp_processor_id();
381
382 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
383
384 /* initialize the CPU structures (moved from smp_boot_cpus) */
385 for(i=0; i<NR_CPUS; i++) {
386 cpu_irq_affinity[i] = ~0;
387 }
388 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
389
390 /* The boot CPU must be extended */
391 voyager_extended_vic_processors = 1<<boot_cpu_id;
392	/* initially, all of the first 8 cpus can boot */
393 voyager_allowed_boot_processors = 0xff;
394 /* set up everything for just this CPU, we can alter
395 * this as we start the other CPUs later */
396 /* now get the CPU disposition from the extended CMOS */
397 cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
398 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
399 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16;
400 cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24;
401 cpu_possible_map = phys_cpu_present_map;
402 printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]);
403 /* Here we set up the VIC to enable SMP */
404 /* enable the CPIs by writing the base vector to their register */
405 outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
406 outb(1, VIC_REDIRECT_REGISTER_1);
407 /* set the claim registers for static routing --- Boot CPU gets
408	 * all interrupts until all the other CPUs are started */
409 outb(0xff, VIC_CLAIM_REGISTER_0);
410 outb(0xff, VIC_CLAIM_REGISTER_1);
411 /* Set the Primary and Secondary Microchannel vector
412 * bases to be the same as the ordinary interrupts
413 *
414 * FIXME: This would be more efficient using separate
415 * vectors. */
416 outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE);
417 outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE);
418
419 /* Finally tell the firmware that we're driving */
420 outb(inb(VOYAGER_SUS_IN_CONTROL_PORT) | VOYAGER_IN_CONTROL_FLAG,
421 VOYAGER_SUS_IN_CONTROL_PORT);
422
423 current_thread_info()->cpu = boot_cpu_id;
424 x86_write_percpu(cpu_number, boot_cpu_id);
425}
426
427/*
428 * The bootstrap kernel entry code has set these up. Save them
429 * for a given CPU, id is physical */
430void __init
431smp_store_cpu_info(int id)
432{
433 struct cpuinfo_x86 *c=&cpu_data[id];
434
435 *c = boot_cpu_data;
436
437 identify_secondary_cpu(c);
438}
439
440/* set up the trampoline and return the physical address of the code */
441static __u32 __init
442setup_trampoline(void)
443{
444 /* these two are global symbols in trampoline.S */
445 extern __u8 trampoline_end[];
446 extern __u8 trampoline_data[];
447
448 memcpy((__u8 *)trampoline_base, trampoline_data,
449 trampoline_end - trampoline_data);
450 return virt_to_phys((__u8 *)trampoline_base);
451}
452
453/* Routine initially called when a non-boot CPU is brought online */
454static void __init
455start_secondary(void *unused)
456{
457 __u8 cpuid = hard_smp_processor_id();
458 /* external functions not defined in the headers */
459 extern void calibrate_delay(void);
460
461 cpu_init();
462
463 /* OK, we're in the routine */
464 ack_CPI(VIC_CPU_BOOT_CPI);
465
466	/* set up the 8259 master/slave pair belonging to this CPU ---
467	 * we won't actually receive any interrupts until the boot CPU
468	 * relinquishes its static routing mask */
469 vic_setup_pic();
470
471 qic_setup();
472
473 if(is_cpu_quad() && !is_cpu_vic_boot()) {
474 /* clear the boot CPI */
475 __u8 dummy;
476
477 dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
478 printk("read dummy %d\n", dummy);
479 }
480
481 /* lower the mask to receive CPIs */
482 vic_enable_cpi();
483
484 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
485
486 /* enable interrupts */
487 local_irq_enable();
488
489 /* get our bogomips */
490 calibrate_delay();
491
492 /* save our processor parameters */
493 smp_store_cpu_info(cpuid);
494
495 /* if we're a quad, we may need to bootstrap other CPUs */
496 do_quad_bootstrap();
497
498 /* FIXME: this is rather a poor hack to prevent the CPU
499 * activating softirqs while it's supposed to be waiting for
500 * permission to proceed. Without this, the new per CPU stuff
501 * in the softirqs will fail */
502 local_irq_disable();
503 cpu_set(cpuid, cpu_callin_map);
504
505 /* signal that we're done */
506 cpu_booted_map = 1;
507
508 while (!cpu_isset(cpuid, smp_commenced_mask))
509 rep_nop();
510 local_irq_enable();
511
512 local_flush_tlb();
513
514 cpu_set(cpuid, cpu_online_map);
515 wmb();
516 cpu_idle();
517}
518
519
520/* Routine to kick start the given CPU and wait for it to report ready
521 * (or timeout in startup). When this routine returns, the requested
522 * CPU is either fully running and configured or known to be dead.
523 *
524 * We call this routine sequentially 1 CPU at a time, so no need for
525 * locking */
526
527static void __init
528do_boot_cpu(__u8 cpu)
529{
530 struct task_struct *idle;
531 int timeout;
532 unsigned long flags;
533 int quad_boot = (1<<cpu) & voyager_quad_processors
534 & ~( voyager_extended_vic_processors
535 & voyager_allowed_boot_processors);
536
537 /* This is an area in head.S which was used to set up the
538 * initial kernel stack. We need to alter this to give the
539 * booting CPU a new stack (taken from its idle process) */
540 extern struct {
541 __u8 *esp;
542 unsigned short ss;
543 } stack_start;
544 /* This is the format of the CPI IDT gate (in real mode) which
545 * we're hijacking to boot the CPU */
546 union IDTFormat {
547 struct seg {
548 __u16 Offset;
549 __u16 Segment;
550 } idt;
551 __u32 val;
552 } hijack_source;
553
554 __u32 *hijack_vector;
555 __u32 start_phys_address = setup_trampoline();
556
557 /* There's a clever trick to this: The linux trampoline is
558 * compiled to begin at absolute location zero, so make the
559 * address zero but have the data segment selector compensate
560 * for the actual address */
561 hijack_source.idt.Offset = start_phys_address & 0x000F;
562 hijack_source.idt.Segment = (start_phys_address >> 4) & 0xFFFF;
563
564 cpucount++;
565 alternatives_smp_switch(1);
566
567 idle = fork_idle(cpu);
568 if(IS_ERR(idle))
569 panic("failed fork for CPU%d", cpu);
570 idle->thread.eip = (unsigned long) start_secondary;
571 /* init_tasks (in sched.c) is indexed logically */
572 stack_start.esp = (void *) idle->thread.esp;
573
574 init_gdt(cpu);
575 per_cpu(current_task, cpu) = idle;
576 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
577 irq_ctx_init(cpu);
578
579 /* Note: Don't modify initial ss override */
580 VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
581 (unsigned long)hijack_source.val, hijack_source.idt.Segment,
582 hijack_source.idt.Offset, stack_start.esp));
583
584 /* init lowmem identity mapping */
585 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
586 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
587 flush_tlb_all();
588
589 if(quad_boot) {
590 printk("CPU %d: non extended Quad boot\n", cpu);
591 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4);
592 *hijack_vector = hijack_source.val;
593 } else {
594 printk("CPU%d: extended VIC boot\n", cpu);
595 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4);
596 *hijack_vector = hijack_source.val;
597 /* VIC errata, may also receive interrupt at this address */
598 hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4);
599 *hijack_vector = hijack_source.val;
600 }
601 /* All non-boot CPUs start with interrupts fully masked. Need
602 * to lower the mask of the CPI we're about to send. We do
603 * this in the VIC by masquerading as the processor we're
604 * about to boot and lowering its interrupt mask */
605 local_irq_save(flags);
606 if(quad_boot) {
607 send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
608 } else {
609 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
610 /* here we're altering registers belonging to `cpu' */
611
612 outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
613 /* now go back to our original identity */
614 outb(boot_cpu_id, VIC_PROCESSOR_ID);
615
616 /* and boot the CPU */
617
618 send_CPI((1<<cpu), VIC_CPU_BOOT_CPI);
619 }
620 cpu_booted_map = 0;
621 local_irq_restore(flags);
622
623 /* now wait for it to become ready (or timeout) */
624 for(timeout = 0; timeout < 50000; timeout++) {
625 if(cpu_booted_map)
626 break;
627 udelay(100);
628 }
629 /* reset the page table */
630 zap_low_mappings();
631
632 if (cpu_booted_map) {
633 VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
634 cpu, smp_processor_id()));
635
636 printk("CPU%d: ", cpu);
637 print_cpu_info(&cpu_data[cpu]);
638 wmb();
639 cpu_set(cpu, cpu_callout_map);
640 cpu_set(cpu, cpu_present_map);
641 }
642 else {
643 printk("CPU%d FAILED TO BOOT: ", cpu);
644 if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5)
645 printk("Stuck.\n");
646 else
647 printk("Not responding.\n");
648
649 cpucount--;
650 }
651}
652
653void __init
654smp_boot_cpus(void)
655{
656 int i;
657
658	/* CAT BUS initialisation must be done after memory is initialised */
659 /* FIXME: The L4 has a catbus too, it just needs to be
660 * accessed in a totally different way */
661 if(voyager_level == 5) {
662 voyager_cat_init();
663
664 /* now that the cat has probed the Voyager System Bus, sanity
665 * check the cpu map */
666 if( ((voyager_quad_processors | voyager_extended_vic_processors)
667 & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) {
668 /* should panic */
669 printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n");
670 }
671 } else if(voyager_level == 4)
672 voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0];
673
674 /* this sets up the idle task to run on the current cpu */
675 voyager_extended_cpus = 1;
676 /* Remove the global_irq_holder setting, it triggers a BUG() on
677 * schedule at the moment */
678 //global_irq_holder = boot_cpu_id;
679
680 /* FIXME: Need to do something about this but currently only works
681 * on CPUs with a tsc which none of mine have.
682 smp_tune_scheduling();
683 */
684 smp_store_cpu_info(boot_cpu_id);
685 printk("CPU%d: ", boot_cpu_id);
686 print_cpu_info(&cpu_data[boot_cpu_id]);
687
688 if(is_cpu_quad()) {
689 /* booting on a Quad CPU */
690 printk("VOYAGER SMP: Boot CPU is Quad\n");
691 qic_setup();
692 do_quad_bootstrap();
693 }
694
695 /* enable our own CPIs */
696 vic_enable_cpi();
697
698 cpu_set(boot_cpu_id, cpu_online_map);
699 cpu_set(boot_cpu_id, cpu_callout_map);
700
701 /* loop over all the extended VIC CPUs and boot them. The
702 * Quad CPUs must be bootstrapped by their extended VIC cpu */
703 for(i = 0; i < NR_CPUS; i++) {
704 if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
705 continue;
706 do_boot_cpu(i);
707		/* This udelay seems to be needed for the Quad boots;
708 * don't remove unless you know what you're doing */
709 udelay(1000);
710 }
711	/* we could compute the total bogomips here, but why bother?
712 * Code added from smpboot.c */
713 {
714 unsigned long bogosum = 0;
715 for (i = 0; i < NR_CPUS; i++)
716 if (cpu_isset(i, cpu_online_map))
717 bogosum += cpu_data[i].loops_per_jiffy;
718 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
719 cpucount+1,
720 bogosum/(500000/HZ),
721 (bogosum/(5000/HZ))%100);
722 }
723 voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
724 printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus);
725 /* that's it, switch to symmetric mode */
726 outb(0, VIC_PRIORITY_REGISTER);
727 outb(0, VIC_CLAIM_REGISTER_0);
728 outb(0, VIC_CLAIM_REGISTER_1);
729
730 VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
731}
732
733/* Reload the secondary CPU's task structure (this function does not
734 * return) */
735void __init
736initialize_secondary(void)
737{
738#if 0
739 // AC kernels only
740 set_current(hard_get_current());
741#endif
742
743 /*
744 * We don't actually need to load the full TSS,
745 * basically just the stack pointer and the eip.
746 */
747
748 asm volatile(
749 "movl %0,%%esp\n\t"
750 "jmp *%1"
751 :
752 :"r" (current->thread.esp),"r" (current->thread.eip));
753}
754
755/* handle a Voyager SYS_INT -- If we don't, the base board will
756 * panic the system.
757 *
758 * System interrupts occur because some problem was detected on the
759 * various busses. To find out what, you have to probe all the
760 * hardware via the CAT bus. FIXME: At the moment we do nothing. */
761fastcall void
762smp_vic_sys_interrupt(struct pt_regs *regs)
763{
764 ack_CPI(VIC_SYS_INT);
765 printk("Voyager SYSTEM INTERRUPT\n");
766}
767
768/* Handle a Voyager CMN_INT; these interrupts occur either because of
769 * a system status change or because a single bit memory error
770 * occurred. FIXME: At the moment, ignore all this. */
771fastcall void
772smp_vic_cmn_interrupt(struct pt_regs *regs)
773{
774 static __u8 in_cmn_int = 0;
775 static DEFINE_SPINLOCK(cmn_int_lock);
776
777 /* common ints are broadcast, so make sure we only do this once */
778 _raw_spin_lock(&cmn_int_lock);
779 if(in_cmn_int)
780 goto unlock_end;
781
782 in_cmn_int++;
783 _raw_spin_unlock(&cmn_int_lock);
784
785 VDEBUG(("Voyager COMMON INTERRUPT\n"));
786
787 if(voyager_level == 5)
788 voyager_cat_do_common_interrupt();
789
790 _raw_spin_lock(&cmn_int_lock);
791 in_cmn_int = 0;
792 unlock_end:
793 _raw_spin_unlock(&cmn_int_lock);
794 ack_CPI(VIC_CMN_INT);
795}
796
797/*
798 * Reschedule call back. Nothing to do, all the work is done
799 * automatically when we return from the interrupt. */
800static void
801smp_reschedule_interrupt(void)
802{
803 /* do nothing */
804}
805
806static struct mm_struct * flush_mm;
807static unsigned long flush_va;
808static DEFINE_SPINLOCK(tlbstate_lock);
809#define FLUSH_ALL 0xffffffff
810
811/*
812 * We cannot call mmdrop() because we are in interrupt context,
813 * instead update mm->cpu_vm_mask.
814 *
815 * We need to reload %cr3 since the page tables may be going
816 * away from under us..
817 */
818static inline void
819leave_mm (unsigned long cpu)
820{
821 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
822 BUG();
823 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
824 load_cr3(swapper_pg_dir);
825}
826
827
828/*
829 * Invalidate call-back
830 */
831static void
832smp_invalidate_interrupt(void)
833{
834 __u8 cpu = smp_processor_id();
835
836 if (!test_bit(cpu, &smp_invalidate_needed))
837 return;
838 /* This will flood messages. Don't uncomment unless you see
839	 * problems with cross cpu invalidation
840 VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
841 smp_processor_id()));
842 */
843
844 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
845 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
846 if (flush_va == FLUSH_ALL)
847 local_flush_tlb();
848 else
849 __flush_tlb_one(flush_va);
850 } else
851 leave_mm(cpu);
852 }
853 smp_mb__before_clear_bit();
854 clear_bit(cpu, &smp_invalidate_needed);
855 smp_mb__after_clear_bit();
856}
857
858/* All the new flush operations for 2.4 */
859
860
861/* This routine is called with a physical cpu mask */
862static void
863voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
864 unsigned long va)
865{
866 int stuck = 50000;
867
868 if (!cpumask)
869 BUG();
870 if ((cpumask & cpus_addr(cpu_online_map)[0]) != cpumask)
871 BUG();
872 if (cpumask & (1 << smp_processor_id()))
873 BUG();
874 if (!mm)
875 BUG();
876
877 spin_lock(&tlbstate_lock);
878
879 flush_mm = mm;
880 flush_va = va;
881 atomic_set_mask(cpumask, &smp_invalidate_needed);
882 /*
883 * We have to send the CPI only to
884 * CPUs affected.
885 */
886 send_CPI(cpumask, VIC_INVALIDATE_CPI);
887
888 while (smp_invalidate_needed) {
889 mb();
890 if(--stuck == 0) {
891 printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id());
892 break;
893 }
894 }
895
896 /* Uncomment only to debug invalidation problems
897 VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
898 */
899
900 flush_mm = NULL;
901 flush_va = 0;
902 spin_unlock(&tlbstate_lock);
903}
904
905void
906flush_tlb_current_task(void)
907{
908 struct mm_struct *mm = current->mm;
909 unsigned long cpu_mask;
910
911 preempt_disable();
912
913 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
914 local_flush_tlb();
915 if (cpu_mask)
916 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
917
918 preempt_enable();
919}
920
921
922void
923flush_tlb_mm (struct mm_struct * mm)
924{
925 unsigned long cpu_mask;
926
927 preempt_disable();
928
929 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
930
931 if (current->active_mm == mm) {
932 if (current->mm)
933 local_flush_tlb();
934 else
935 leave_mm(smp_processor_id());
936 }
937 if (cpu_mask)
938 voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
939
940 preempt_enable();
941}
942
943void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
944{
945 struct mm_struct *mm = vma->vm_mm;
946 unsigned long cpu_mask;
947
948 preempt_disable();
949
950 cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
951 if (current->active_mm == mm) {
952 if(current->mm)
953 __flush_tlb_one(va);
954 else
955 leave_mm(smp_processor_id());
956 }
957
958 if (cpu_mask)
959 voyager_flush_tlb_others(cpu_mask, mm, va);
960
961 preempt_enable();
962}
963EXPORT_SYMBOL(flush_tlb_page);
964
965/* enable the requested IRQs */
966static void
967smp_enable_irq_interrupt(void)
968{
969 __u8 irq;
970 __u8 cpu = get_cpu();
971
972 VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
973 vic_irq_enable_mask[cpu]));
974
975 spin_lock(&vic_irq_lock);
976 for(irq = 0; irq < 16; irq++) {
977 if(vic_irq_enable_mask[cpu] & (1<<irq))
978 enable_local_vic_irq(irq);
979 }
980 vic_irq_enable_mask[cpu] = 0;
981 spin_unlock(&vic_irq_lock);
982
983 put_cpu_no_resched();
984}
985
986/*
987 * CPU halt call-back
988 */
989static void
990smp_stop_cpu_function(void *dummy)
991{
992 VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
993 cpu_clear(smp_processor_id(), cpu_online_map);
994 local_irq_disable();
995 for(;;)
996 halt();
997}
998
999static DEFINE_SPINLOCK(call_lock);
1000
1001struct call_data_struct {
1002 void (*func) (void *info);
1003 void *info;
1004 volatile unsigned long started;
1005 volatile unsigned long finished;
1006 int wait;
1007};
1008
1009static struct call_data_struct * call_data;
1010
1011/* execute a thread on a new CPU. The function to be called must be
1012 * previously set up. This is used to schedule a function for
1013 * execution on all CPUs - set up the function, then broadcast a
1014 * function_interrupt CPI to come here on each CPU */
1015static void
1016smp_call_function_interrupt(void)
1017{
1018 void (*func) (void *info) = call_data->func;
1019 void *info = call_data->info;
1020 /* must take copy of wait because call_data may be replaced
1021 * unless the function is waiting for us to finish */
1022 int wait = call_data->wait;
1023 __u8 cpu = smp_processor_id();
1024
1025 /*
1026 * Notify initiating CPU that I've grabbed the data and am
1027 * about to execute the function
1028 */
1029 mb();
1030 if(!test_and_clear_bit(cpu, &call_data->started)) {
1031 /* If the bit wasn't set, this could be a replay */
1032		printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function with no call pending\n", cpu);
1033 return;
1034 }
1035 /*
1036 * At this point the info structure may be out of scope unless wait==1
1037 */
1038 irq_enter();
1039 (*func)(info);
1040 irq_exit();
1041 if (wait) {
1042 mb();
1043 clear_bit(cpu, &call_data->finished);
1044 }
1045}
1046
1047static int
1048voyager_smp_call_function_mask (cpumask_t cpumask,
1049 void (*func) (void *info), void *info,
1050 int wait)
1051{
1052 struct call_data_struct data;
1053 u32 mask = cpus_addr(cpumask)[0];
1054
1055 mask &= ~(1<<smp_processor_id());
1056
1057 if (!mask)
1058 return 0;
1059
1060 /* Can deadlock when called with interrupts disabled */
1061 WARN_ON(irqs_disabled());
1062
1063 data.func = func;
1064 data.info = info;
1065 data.started = mask;
1066 data.wait = wait;
1067 if (wait)
1068 data.finished = mask;
1069
1070 spin_lock(&call_lock);
1071 call_data = &data;
1072 wmb();
1073 /* Send a message to all other CPUs and wait for them to respond */
1074 send_CPI(mask, VIC_CALL_FUNCTION_CPI);
1075
1076 /* Wait for response */
1077 while (data.started)
1078 barrier();
1079
1080 if (wait)
1081 while (data.finished)
1082 barrier();
1083
1084 spin_unlock(&call_lock);
1085
1086 return 0;
1087}
1088
1089/* Sorry about the name. In an APIC based system, the APICs
1090 * themselves are programmed to send a timer interrupt. This is used
1091 * by linux to reschedule the processor. Voyager doesn't have this,
1092 * so we use the system clock to interrupt one processor, which in
1093 * turn, broadcasts a timer CPI to all the others --- we receive that
1094 * CPI here. We don't use this actually for counting so losing
1095 * CPI here. We don't actually use this for counting, so losing
1096 *
1097 * FIXME: For those CPU's which actually have a local APIC, we could
1098 * try to use it to trigger this interrupt instead of having to
1099 * broadcast the timer tick. Unfortunately, all my pentium DYADs have
1100 * no local APIC, so I can't do this
1101 *
1102 * This function is currently a placeholder and is unused in the code */
1103fastcall void
1104smp_apic_timer_interrupt(struct pt_regs *regs)
1105{
1106 struct pt_regs *old_regs = set_irq_regs(regs);
1107 wrapper_smp_local_timer_interrupt();
1108 set_irq_regs(old_regs);
1109}
1110
1111/* All of the QUAD interrupt GATES */
1112fastcall void
1113smp_qic_timer_interrupt(struct pt_regs *regs)
1114{
1115 struct pt_regs *old_regs = set_irq_regs(regs);
1116 ack_QIC_CPI(QIC_TIMER_CPI);
1117 wrapper_smp_local_timer_interrupt();
1118 set_irq_regs(old_regs);
1119}
1120
1121fastcall void
1122smp_qic_invalidate_interrupt(struct pt_regs *regs)
1123{
1124 ack_QIC_CPI(QIC_INVALIDATE_CPI);
1125 smp_invalidate_interrupt();
1126}
1127
1128fastcall void
1129smp_qic_reschedule_interrupt(struct pt_regs *regs)
1130{
1131 ack_QIC_CPI(QIC_RESCHEDULE_CPI);
1132 smp_reschedule_interrupt();
1133}
1134
1135fastcall void
1136smp_qic_enable_irq_interrupt(struct pt_regs *regs)
1137{
1138 ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
1139 smp_enable_irq_interrupt();
1140}
1141
1142fastcall void
1143smp_qic_call_function_interrupt(struct pt_regs *regs)
1144{
1145 ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
1146 smp_call_function_interrupt();
1147}
1148
1149fastcall void
1150smp_vic_cpi_interrupt(struct pt_regs *regs)
1151{
1152 struct pt_regs *old_regs = set_irq_regs(regs);
1153 __u8 cpu = smp_processor_id();
1154
1155 if(is_cpu_quad())
1156 ack_QIC_CPI(VIC_CPI_LEVEL0);
1157 else
1158 ack_VIC_CPI(VIC_CPI_LEVEL0);
1159
1160 if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
1161 wrapper_smp_local_timer_interrupt();
1162 if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
1163 smp_invalidate_interrupt();
1164 if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
1165 smp_reschedule_interrupt();
1166 if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
1167 smp_enable_irq_interrupt();
1168 if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
1169 smp_call_function_interrupt();
1170 set_irq_regs(old_regs);
1171}
1172
1173static void
1174do_flush_tlb_all(void* info)
1175{
1176 unsigned long cpu = smp_processor_id();
1177
1178 __flush_tlb_all();
1179 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
1180 leave_mm(cpu);
1181}
1182
1183
1184/* flush the TLB of every active CPU in the system */
1185void
1186flush_tlb_all(void)
1187{
1188 on_each_cpu(do_flush_tlb_all, 0, 1, 1);
1189}
1190
1191/* used to set up the trampoline for other CPUs when the memory manager
1192 * is sorted out */
1193void __init
1194smp_alloc_memory(void)
1195{
1196 trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE);
1197 if(__pa(trampoline_base) >= 0x93000)
1198 BUG();
1199}
1200
1201/* send a reschedule CPI to one CPU by physical CPU number*/
1202static void
1203voyager_smp_send_reschedule(int cpu)
1204{
1205 send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
1206}
1207
1208
1209int
1210hard_smp_processor_id(void)
1211{
1212 __u8 i;
1213 __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
1214 if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
1215 return cpumask & 0x1F;
1216
1217 for(i = 0; i < 8; i++) {
1218 if(cpumask & (1<<i))
1219 return i;
1220 }
1221 printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
1222 return 0;
1223}
1224
1225int
1226safe_smp_processor_id(void)
1227{
1228 return hard_smp_processor_id();
1229}
1230
1231/* broadcast a halt to all other CPUs */
1232static void
1233voyager_smp_send_stop(void)
1234{
1235 smp_call_function(smp_stop_cpu_function, NULL, 1, 1);
1236}
1237
1238/* this function is triggered in time.c when a clock tick fires
1239 * we need to re-broadcast the tick to all CPUs */
1240void
1241smp_vic_timer_interrupt(void)
1242{
1243 send_CPI_allbutself(VIC_TIMER_CPI);
1244 smp_local_timer_interrupt();
1245}
1246
1247/* local (per CPU) timer interrupt. It does both profiling and
1248 * process statistics/rescheduling.
1249 *
1250 * We do profiling in every local tick, statistics/rescheduling
1251 * happen only every 'profiling multiplier' ticks. The default
1252 * multiplier is 1 and it can be changed by writing the new multiplier
1253 * value into /proc/profile.
1254 */
1255void
1256smp_local_timer_interrupt(void)
1257{
1258 int cpu = smp_processor_id();
1259 long weight;
1260
1261 profile_tick(CPU_PROFILING);
1262 if (--per_cpu(prof_counter, cpu) <= 0) {
1263 /*
1264 * The multiplier may have changed since the last time we got
1265 * to this point as a result of the user writing to
1266 * /proc/profile. In this case we need to adjust the APIC
1267 * timer accordingly.
1268 *
1269 * Interrupts are already masked off at this point.
1270 */
1271 per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu);
1272 if (per_cpu(prof_counter, cpu) !=
1273 per_cpu(prof_old_multiplier, cpu)) {
1274 /* FIXME: need to update the vic timer tick here */
1275 per_cpu(prof_old_multiplier, cpu) =
1276 per_cpu(prof_counter, cpu);
1277 }
1278
1279 update_process_times(user_mode_vm(get_irq_regs()));
1280 }
1281
1282 if( ((1<<cpu) & voyager_extended_vic_processors) == 0)
1283 /* only extended VIC processors participate in
1284 * interrupt distribution */
1285 return;
1286
1287 /*
1288 * We take the 'long' return path, and there every subsystem
1289	 * grabs the appropriate locks (kernel lock/ irq lock).
1290 *
1291 * we might want to decouple profiling from the 'long path',
1292 * and do the profiling totally in assembly.
1293 *
1294 * Currently this isn't too much of an issue (performance wise),
1295 * we can take more than 100K local irqs per second on a 100 MHz P5.
1296 */
1297
1298 if((++vic_tick[cpu] & 0x7) != 0)
1299 return;
1300 /* get here every 16 ticks (about every 1/6 of a second) */
1301
1302 /* Change our priority to give someone else a chance at getting
1303 * the IRQ. The algorithm goes like this:
1304 *
1305 * In the VIC, the dynamically routed interrupt is always
1306 * handled by the lowest priority eligible (i.e. receiving
1307 * interrupts) CPU. If >1 eligible CPUs are equal lowest, the
1308 * lowest processor number gets it.
1309 *
1310 * The priority of a CPU is controlled by a special per-CPU
1311	 * VIC priority register which is 3 bits wide, 0 being the lowest
1312	 * and 7 the highest priority.
1313 *
1314 * Therefore we subtract the average number of interrupts from
1315 * the number we've fielded. If this number is negative, we
1316 * lower the activity count and if it is positive, we raise
1317 * it.
1318 *
1319 * I'm afraid this still leads to odd looking interrupt counts:
1320 * the totals are all roughly equal, but the individual ones
1321 * look rather skewed.
1322 *
1323 * FIXME: This algorithm is total crap when mixed with SMP
1324 * affinity code since we now try to even up the interrupt
1325 * counts when an affinity binding is keeping them on a
1326 * particular CPU*/
1327 weight = (vic_intr_count[cpu]*voyager_extended_cpus
1328 - vic_intr_total) >> 4;
1329 weight += 4;
1330 if(weight > 7)
1331 weight = 7;
1332 if(weight < 0)
1333 weight = 0;
1334
1335 outb((__u8)weight, VIC_PRIORITY_REGISTER);
1336
1337#ifdef VOYAGER_DEBUG
1338 if((vic_tick[cpu] & 0xFFF) == 0) {
1339 /* print this message roughly every 25 secs */
1340 printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
1341 cpu, vic_tick[cpu], weight);
1342 }
1343#endif
1344}
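/* Editor's worked example of the weight computation above (made-up
 * numbers, purely illustrative, not from any real trace): on a machine
 * with 4 extended CPUs, if this CPU has fielded 30 interrupts while
 * vic_intr_total is 80, then (30*4 - 80) >> 4 = 40 >> 4 = 2, and adding
 * the bias of 4 gives a priority value of 6.  Because the VIC hands a
 * dynamically routed interrupt to the lowest-priority eligible CPU, a
 * busier-than-average CPU ends up with a larger priority value and is
 * steered away from the next interrupt. */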
1345
1346/* setup the profiling timer */
1347int
1348setup_profiling_timer(unsigned int multiplier)
1349{
1350 int i;
1351
1352 if ( (!multiplier))
1353 return -EINVAL;
1354
1355 /*
1356 * Set the new multiplier for each CPU. CPUs don't start using the
1357 * new values until the next timer interrupt in which they do process
1358 * accounting.
1359 */
1360 for (i = 0; i < NR_CPUS; ++i)
1361 per_cpu(prof_multiplier, i) = multiplier;
1362
1363 return 0;
1364}
1365
1366/* This is a bit of a mess, but forced on us by the genirq changes:
1367 * there's no genirq handler that really does what voyager wants,
1368 * so hack it up with the simple IRQ handler */
1369static void fastcall
1370handle_vic_irq(unsigned int irq, struct irq_desc *desc)
1371{
1372 before_handle_vic_irq(irq);
1373 handle_simple_irq(irq, desc);
1374 after_handle_vic_irq(irq);
1375}
1376
1377
1378/* The CPIs are handled in the per cpu 8259s, so they must be
1379 * enabled to be received: FIX: enabling the CPIs in the early
1380 * boot sequence interferes with bug checking; enable them later
1381 * on in smp_init */
1382#define VIC_SET_GATE(cpi, vector) \
1383 set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector))
1384#define QIC_SET_GATE(cpi, vector) \
1385 set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
1386
1387void __init
1388smp_intr_init(void)
1389{
1390 int i;
1391
1392 /* initialize the per cpu irq mask to all disabled */
1393 for(i = 0; i < NR_CPUS; i++)
1394 vic_irq_mask[i] = 0xFFFF;
1395
1396 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
1397
1398 VIC_SET_GATE(VIC_SYS_INT, vic_sys_interrupt);
1399 VIC_SET_GATE(VIC_CMN_INT, vic_cmn_interrupt);
1400
1401 QIC_SET_GATE(QIC_TIMER_CPI, qic_timer_interrupt);
1402 QIC_SET_GATE(QIC_INVALIDATE_CPI, qic_invalidate_interrupt);
1403 QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
1404 QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
1405 QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
1406
1407
1408 /* now put the VIC descriptor into the first 48 IRQs
1409 *
1410 * This is for later: first 16 correspond to PC IRQs; next 16
1411 * are Primary MC IRQs and final 16 are Secondary MC IRQs */
1412 for(i = 0; i < 48; i++)
1413 set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
1414}
1415
1416/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
1417 * processor to receive the CPI) */
1418static void
1419send_CPI(__u32 cpuset, __u8 cpi)
1420{
1421 int cpu;
1422 __u32 quad_cpuset = (cpuset & voyager_quad_processors);
1423
1424 if(cpi < VIC_START_FAKE_CPI) {
1425 /* fake CPI are only used for booting, so send to the
1426 * extended quads as well---Quads must be VIC booted */
1427 outb((__u8)(cpuset), VIC_CPI_Registers[cpi]);
1428 return;
1429 }
1430 if(quad_cpuset)
1431 send_QIC_CPI(quad_cpuset, cpi);
1432 cpuset &= ~quad_cpuset;
1433	cpuset &= 0xff;		/* only first 8 CPUs valid for VIC CPI */
1434 if(cpuset == 0)
1435 return;
1436 for_each_online_cpu(cpu) {
1437 if(cpuset & (1<<cpu))
1438 set_bit(cpi, &vic_cpi_mailbox[cpu]);
1439 }
1440 if(cpuset)
1441 outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
1442}
1443
1444/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
1445 * set the cache line to shared by reading it.
1446 *
1447 * DON'T make this inline otherwise the cache line read will be
1448 * optimised away
1449 * */
1450static int
1451ack_QIC_CPI(__u8 cpi) {
1452 __u8 cpu = hard_smp_processor_id();
1453
1454 cpi &= 7;
1455
1456 outb(1<<cpi, QIC_INTERRUPT_CLEAR1);
1457 return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
1458}
1459
1460static void
1461ack_special_QIC_CPI(__u8 cpi)
1462{
1463 switch(cpi) {
1464 case VIC_CMN_INT:
1465 outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
1466 break;
1467 case VIC_SYS_INT:
1468 outb(QIC_SYS_INT, QIC_INTERRUPT_CLEAR0);
1469 break;
1470 }
1471 /* also clear at the VIC, just in case (nop for non-extended proc) */
1472 ack_VIC_CPI(cpi);
1473}
1474
1475/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
1476static void
1477ack_VIC_CPI(__u8 cpi)
1478{
1479#ifdef VOYAGER_DEBUG
1480 unsigned long flags;
1481 __u16 isr;
1482 __u8 cpu = smp_processor_id();
1483
1484 local_irq_save(flags);
1485 isr = vic_read_isr();
1486 if((isr & (1<<(cpi &7))) == 0) {
1487 printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
1488 }
1489#endif
1490 /* send specific EOI; the two system interrupts have
1491 * bit 4 set for a separate vector but behave as the
1492 * corresponding 3 bit intr */
1493 outb_p(0x60|(cpi & 7),0x20);
1494
1495#ifdef VOYAGER_DEBUG
1496 if((vic_read_isr() & (1<<(cpi &7))) != 0) {
1497 printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
1498 }
1499 local_irq_restore(flags);
1500#endif
1501}
1502
1503/* cribbed with thanks from irq.c */
1504#define __byte(x,y) (((unsigned char *)&(y))[x])
1505#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
1506#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
1507
1508static unsigned int
1509startup_vic_irq(unsigned int irq)
1510{
1511 unmask_vic_irq(irq);
1512
1513 return 0;
1514}
1515
1516/* The enable and disable routines. This is where we run into
1517 * conflicting architectural philosophy. Fundamentally, the voyager
1518 * architecture does not expect to have to disable interrupts globally
1519 * (the IRQ controllers belong to each CPU). The processor masquerade
1520 * which is used to start the system shouldn't be used in a running OS
1521 * since it will cause great confusion if two separate CPUs drive to
1522 * the same IRQ controller (I know, I've tried it).
1523 *
1524 * The solution is a variant on the NCR lazy SPL design:
1525 *
1526 * 1) To disable an interrupt, do nothing (other than set the
1527 * IRQ_DISABLED flag). This dares the interrupt actually to arrive.
1528 *
1529 * 2) If the interrupt dares to come in, raise the local mask against
1530 * it (this will result in all the CPU masks being raised
1531 * eventually).
1532 *
1533 * 3) To enable the interrupt, lower the mask on the local CPU and
1534 * broadcast an Interrupt enable CPI which causes all other CPUs to
1535 * adjust their masks accordingly. */
1536
1537static void
1538unmask_vic_irq(unsigned int irq)
1539{
1540	/* linux doesn't do processor-irq affinity, so enable on
1541 * all CPUs we know about */
1542 int cpu = smp_processor_id(), real_cpu;
1543 __u16 mask = (1<<irq);
1544 __u32 processorList = 0;
1545 unsigned long flags;
1546
1547 VDEBUG(("VOYAGER: unmask_vic_irq(%d) CPU%d affinity 0x%lx\n",
1548 irq, cpu, cpu_irq_affinity[cpu]));
1549 spin_lock_irqsave(&vic_irq_lock, flags);
1550 for_each_online_cpu(real_cpu) {
1551 if(!(voyager_extended_vic_processors & (1<<real_cpu)))
1552 continue;
1553 if(!(cpu_irq_affinity[real_cpu] & mask)) {
1554 /* irq has no affinity for this CPU, ignore */
1555 continue;
1556 }
1557 if(real_cpu == cpu) {
1558 enable_local_vic_irq(irq);
1559 }
1560 else if(vic_irq_mask[real_cpu] & mask) {
1561 vic_irq_enable_mask[real_cpu] |= mask;
1562 processorList |= (1<<real_cpu);
1563 }
1564 }
1565 spin_unlock_irqrestore(&vic_irq_lock, flags);
1566 if(processorList)
1567 send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
1568}
1569
1570static void
1571mask_vic_irq(unsigned int irq)
1572{
1573 /* lazy disable, do nothing */
1574}
1575
1576static void
1577enable_local_vic_irq(unsigned int irq)
1578{
1579 __u8 cpu = smp_processor_id();
1580 __u16 mask = ~(1 << irq);
1581 __u16 old_mask = vic_irq_mask[cpu];
1582
1583 vic_irq_mask[cpu] &= mask;
1584 if(vic_irq_mask[cpu] == old_mask)
1585 return;
1586
1587 VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
1588 irq, cpu));
1589
1590 if (irq & 8) {
1591 outb_p(cached_A1(cpu),0xA1);
1592 (void)inb_p(0xA1);
1593 }
1594 else {
1595 outb_p(cached_21(cpu),0x21);
1596 (void)inb_p(0x21);
1597 }
1598}
1599
1600static void
1601disable_local_vic_irq(unsigned int irq)
1602{
1603 __u8 cpu = smp_processor_id();
1604 __u16 mask = (1 << irq);
1605 __u16 old_mask = vic_irq_mask[cpu];
1606
1607 if(irq == 7)
1608 return;
1609
1610 vic_irq_mask[cpu] |= mask;
1611 if(old_mask == vic_irq_mask[cpu])
1612 return;
1613
1614 VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
1615 irq, cpu));
1616
1617 if (irq & 8) {
1618 outb_p(cached_A1(cpu),0xA1);
1619 (void)inb_p(0xA1);
1620 }
1621 else {
1622 outb_p(cached_21(cpu),0x21);
1623 (void)inb_p(0x21);
1624 }
1625}
1626
1627/* The VIC is level triggered, so the ack can only be issued after the
1628 * interrupt completes. However, we do Voyager lazy interrupt
1629 * handling here: It is an extremely expensive operation to mask an
1630 * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If
1631 * this interrupt actually comes in, then we mask and ack here to push
1632 * the interrupt off to another CPU */
1633static void
1634before_handle_vic_irq(unsigned int irq)
1635{
1636 irq_desc_t *desc = irq_desc + irq;
1637 __u8 cpu = smp_processor_id();
1638
1639 _raw_spin_lock(&vic_irq_lock);
1640 vic_intr_total++;
1641 vic_intr_count[cpu]++;
1642
1643 if(!(cpu_irq_affinity[cpu] & (1<<irq))) {
1644 /* The irq is not in our affinity mask, push it off
1645 * onto another CPU */
1646 VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n",
1647 irq, cpu));
1648 disable_local_vic_irq(irq);
1649 /* set IRQ_INPROGRESS to prevent the handler in irq.c from
1650 * actually calling the interrupt routine */
1651 desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
1652 } else if(desc->status & IRQ_DISABLED) {
1653 /* Damn, the interrupt actually arrived, do the lazy
1654 * disable thing. The interrupt routine in irq.c will
1655		 * not handle an IRQ_DISABLED interrupt, so nothing more
1656 * need be done here */
1657 VDEBUG(("VOYAGER DEBUG: lazy disable of irq %d on CPU %d\n",
1658 irq, cpu));
1659 disable_local_vic_irq(irq);
1660 desc->status |= IRQ_REPLAY;
1661 } else {
1662 desc->status &= ~IRQ_REPLAY;
1663 }
1664
1665 _raw_spin_unlock(&vic_irq_lock);
1666}
1667
1668/* Finish the VIC interrupt: basically mask */
1669static void
1670after_handle_vic_irq(unsigned int irq)
1671{
1672 irq_desc_t *desc = irq_desc + irq;
1673
1674 _raw_spin_lock(&vic_irq_lock);
1675 {
1676 unsigned int status = desc->status & ~IRQ_INPROGRESS;
1677#ifdef VOYAGER_DEBUG
1678 __u16 isr;
1679#endif
1680
1681 desc->status = status;
1682 if ((status & IRQ_DISABLED))
1683 disable_local_vic_irq(irq);
1684#ifdef VOYAGER_DEBUG
1685 /* DEBUG: before we ack, check what's in progress */
1686 isr = vic_read_isr();
1687 if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) {
1688 int i;
1689 __u8 cpu = smp_processor_id();
1690 __u8 real_cpu;
1691 int mask; /* Um... initialize me??? --RR */
1692
1693 printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
1694 cpu, irq);
1695 for_each_possible_cpu(real_cpu, mask) {
1696
1697 outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
1698 VIC_PROCESSOR_ID);
1699 isr = vic_read_isr();
1700 if(isr & (1<<irq)) {
1701 printk("VOYAGER SMP: CPU%d ack irq %d\n",
1702 real_cpu, irq);
1703 ack_vic_irq(irq);
1704 }
1705 outb(cpu, VIC_PROCESSOR_ID);
1706 }
1707 }
1708#endif /* VOYAGER_DEBUG */
1709 /* as soon as we ack, the interrupt is eligible for
1710 * receipt by another CPU so everything must be in
1711 * order here */
1712 ack_vic_irq(irq);
1713 if(status & IRQ_REPLAY) {
1714 /* replay is set if we disable the interrupt
1715 * in the before_handle_vic_irq() routine, so
1716 * clear the in progress bit here to allow the
1717 * next CPU to handle this correctly */
1718 desc->status &= ~(IRQ_REPLAY | IRQ_INPROGRESS);
1719 }
1720#ifdef VOYAGER_DEBUG
1721 isr = vic_read_isr();
1722 if((isr & (1<<irq)) != 0)
1723 printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n",
1724 irq, isr);
1725#endif /* VOYAGER_DEBUG */
1726 }
1727 _raw_spin_unlock(&vic_irq_lock);
1728
1729 /* All code after this point is out of the main path - the IRQ
1730 * may be intercepted by another CPU if reasserted */
1731}
1732
1733
1734/* Linux processor - interrupt affinity manipulations.
1735 *
1736 * For each processor, we maintain a 32 bit irq affinity mask.
1737 * Initially it is set to all 1's so every processor accepts every
1738 * interrupt. In this call, we change the processor's affinity mask:
1739 *
1740 * Change from enable to disable:
1741 *
1742 * If the interrupt ever comes in to the processor, we will disable it
1743 * and ack it to push it off to another CPU, so just accept the mask here.
1744 *
1745 * Change from disable to enable:
1746 *
1747 * change the mask and then do an interrupt enable CPI to re-enable on
1748 * the selected processors */
1749
1750void
1751set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
1752{
1753 /* Only extended processors handle interrupts */
1754 unsigned long real_mask;
1755 unsigned long irq_mask = 1 << irq;
1756 int cpu;
1757
1758 real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
1759
1760 if(cpus_addr(mask)[0] == 0)
1761 /* can't have no cpu's to accept the interrupt -- extremely
1762 * bad things will happen */
1763 return;
1764
1765 if(irq == 0)
1766 /* can't change the affinity of the timer IRQ. This
1767 * is due to the constraint in the voyager
1768		 * architecture that the CPI also comes in on an IRQ
1769 * line and we have chosen IRQ0 for this. If you
1770 * raise the mask on this interrupt, the processor
1771		 * will no longer be able to accept VIC CPIs */
1772 return;
1773
1774 if(irq >= 32)
1775 /* You can only have 32 interrupts in a voyager system
1776 * (and 32 only if you have a secondary microchannel
1777 * bus) */
1778 return;
1779
1780 for_each_online_cpu(cpu) {
1781 unsigned long cpu_mask = 1 << cpu;
1782
1783 if(cpu_mask & real_mask) {
1784 /* enable the interrupt for this cpu */
1785 cpu_irq_affinity[cpu] |= irq_mask;
1786 } else {
1787 /* disable the interrupt for this cpu */
1788 cpu_irq_affinity[cpu] &= ~irq_mask;
1789 }
1790 }
1791 /* this is magic, we now have the correct affinity maps, so
1792 * enable the interrupt. This will send an enable CPI to
1793 * those cpu's who need to enable it in their local masks,
1794	 * causing them to correct for the new affinity. If the
1795 * interrupt is currently globally disabled, it will simply be
1796 * disabled again as it comes in (voyager lazy disable). If
1797 * the affinity map is tightened to disable the interrupt on a
1798 * cpu, it will be pushed off when it comes in */
1799 unmask_vic_irq(irq);
1800}
1801
1802static void
1803ack_vic_irq(unsigned int irq)
1804{
1805 if (irq & 8) {
1806 outb(0x62,0x20); /* Specific EOI to cascade */
1807 outb(0x60|(irq & 7),0xA0);
1808 } else {
1809 outb(0x60 | (irq & 7),0x20);
1810 }
1811}
1812
1813/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259
1814 * but are not vectored by it. This means that the 8259 mask must be
1815 * lowered to receive them */
1816static __init void
1817vic_enable_cpi(void)
1818{
1819 __u8 cpu = smp_processor_id();
1820
1821 /* just take a copy of the current mask (nop for boot cpu) */
1822 vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
1823
1824 enable_local_vic_irq(VIC_CPI_LEVEL0);
1825 enable_local_vic_irq(VIC_CPI_LEVEL1);
1826 /* for sys int and cmn int */
1827 enable_local_vic_irq(7);
1828
1829 if(is_cpu_quad()) {
1830 outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
1831 outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
1832 VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
1833 cpu, QIC_CPI_ENABLE));
1834 }
1835
1836 VDEBUG(("VOYAGER SMP: ENABLE CPI: CPU%d: MASK 0x%x\n",
1837 cpu, vic_irq_mask[cpu]));
1838}
1839
1840void
1841voyager_smp_dump()
1842{
1843 int old_cpu = smp_processor_id(), cpu;
1844
1845 /* dump the interrupt masks of each processor */
1846 for_each_online_cpu(cpu) {
1847 __u16 imr, isr, irr;
1848 unsigned long flags;
1849
1850 local_irq_save(flags);
1851 outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
1852 imr = (inb(0xa1) << 8) | inb(0x21);
1853 outb(0x0a, 0xa0);
1854 irr = inb(0xa0) << 8;
1855 outb(0x0a, 0x20);
1856 irr |= inb(0x20);
1857 outb(0x0b, 0xa0);
1858 isr = inb(0xa0) << 8;
1859 outb(0x0b, 0x20);
1860 isr |= inb(0x20);
1861 outb(old_cpu, VIC_PROCESSOR_ID);
1862 local_irq_restore(flags);
1863 printk("\tCPU%d: mask=0x%x, IMR=0x%x, IRR=0x%x, ISR=0x%x\n",
1864 cpu, vic_irq_mask[cpu], imr, irr, isr);
1865#if 0
1866		/* These lines are put in to try to unstick an un-ack'd irq */
1867 if(isr != 0) {
1868 int irq;
1869 for(irq=0; irq<16; irq++) {
1870 if(isr & (1<<irq)) {
1871 printk("\tCPU%d: ack irq %d\n",
1872 cpu, irq);
1873 local_irq_save(flags);
1874 outb(VIC_CPU_MASQUERADE_ENABLE | cpu,
1875 VIC_PROCESSOR_ID);
1876 ack_vic_irq(irq);
1877 outb(old_cpu, VIC_PROCESSOR_ID);
1878 local_irq_restore(flags);
1879 }
1880 }
1881 }
1882#endif
1883 }
1884}
1885
1886void
1887smp_voyager_power_off(void *dummy)
1888{
1889 if(smp_processor_id() == boot_cpu_id)
1890 voyager_power_off();
1891 else
1892 smp_stop_cpu_function(NULL);
1893}
1894
1895static void __init
1896voyager_smp_prepare_cpus(unsigned int max_cpus)
1897{
1898 /* FIXME: ignore max_cpus for now */
1899 smp_boot_cpus();
1900}
1901
1902static void __devinit voyager_smp_prepare_boot_cpu(void)
1903{
1904 init_gdt(smp_processor_id());
1905 switch_to_new_gdt();
1906
1907 cpu_set(smp_processor_id(), cpu_online_map);
1908 cpu_set(smp_processor_id(), cpu_callout_map);
1909 cpu_set(smp_processor_id(), cpu_possible_map);
1910 cpu_set(smp_processor_id(), cpu_present_map);
1911}
1912
1913static int __devinit
1914voyager_cpu_up(unsigned int cpu)
1915{
1916 /* This only works at boot for x86. See "rewrite" above. */
1917 if (cpu_isset(cpu, smp_commenced_mask))
1918 return -ENOSYS;
1919
1920 /* In case one didn't come up */
1921 if (!cpu_isset(cpu, cpu_callin_map))
1922 return -EIO;
1923 /* Unleash the CPU! */
1924 cpu_set(cpu, smp_commenced_mask);
1925 while (!cpu_isset(cpu, cpu_online_map))
1926 mb();
1927 return 0;
1928}
1929
1930static void __init
1931voyager_smp_cpus_done(unsigned int max_cpus)
1932{
1933 zap_low_mappings();
1934}
1935
1936void __init
1937smp_setup_processor_id(void)
1938{
1939 current_thread_info()->cpu = hard_smp_processor_id();
1940 x86_write_percpu(cpu_number, hard_smp_processor_id());
1941}
1942
1943struct smp_ops smp_ops = {
1944 .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu,
1945 .smp_prepare_cpus = voyager_smp_prepare_cpus,
1946 .cpu_up = voyager_cpu_up,
1947 .smp_cpus_done = voyager_smp_cpus_done,
1948
1949 .smp_send_stop = voyager_smp_send_stop,
1950 .smp_send_reschedule = voyager_smp_send_reschedule,
1951 .smp_call_function_mask = voyager_smp_call_function_mask,
1952};
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
new file mode 100644
index 000000000000..f9d595338159
--- /dev/null
+++ b/arch/x86/mach-voyager/voyager_thread.c
@@ -0,0 +1,134 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2001
4 *
5 * Author: J.E.J.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/kernel/voyager_thread.c
8 *
9 * This module provides the machine status monitor thread for the
10 * voyager architecture. This allows us to monitor the machine
11 * environment (temp, voltage, fan function) and the front panel and
12 * internal UPS. If a fault is detected, this thread takes corrective
13 * action (usually just informing init)
14 * */
15
16#include <linux/module.h>
17#include <linux/mm.h>
18#include <linux/kernel_stat.h>
19#include <linux/delay.h>
20#include <linux/mc146818rtc.h>
21#include <linux/init.h>
22#include <linux/bootmem.h>
23#include <linux/kmod.h>
24#include <linux/completion.h>
25#include <linux/sched.h>
26#include <linux/kthread.h>
27#include <asm/desc.h>
28#include <asm/voyager.h>
29#include <asm/vic.h>
30#include <asm/mtrr.h>
31#include <asm/msr.h>
32
33
34struct task_struct *voyager_thread;
35static __u8 set_timeout;
36
37static int
38execute(const char *string)
39{
40 int ret;
41
42 char *envp[] = {
43 "HOME=/",
44 "TERM=linux",
45 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
46 NULL,
47 };
48 char *argv[] = {
49 "/bin/bash",
50 "-c",
51 (char *)string,
52 NULL,
53 };
54
55 if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
56 printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
57 string, ret);
58 }
59 return ret;
60}
61
62static void
63check_from_kernel(void)
64{
65 if(voyager_status.switch_off) {
66
67		/* FIXME: This should be configurable via proc */
68 execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
69 } else if(voyager_status.power_fail) {
70 VDEBUG(("Voyager daemon detected AC power failure\n"));
71
72		/* FIXME: This should be configurable via proc */
73 execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
74 set_timeout = 1;
75 }
76}
77
78static void
79check_continuing_condition(void)
80{
81 if(voyager_status.power_fail) {
82 __u8 data;
83 voyager_cat_psi(VOYAGER_PSI_SUBREAD,
84 VOYAGER_PSI_AC_FAIL_REG, &data);
85 if((data & 0x1f) == 0) {
86 /* all power restored */
87 printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n");
88			/* FIXME: should be user configurable */
89 execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
90 set_timeout = 0;
91 }
92 }
93}
94
95static int
96thread(void *unused)
97{
98 printk(KERN_NOTICE "Voyager starting monitor thread\n");
99
100 for (;;) {
101 set_current_state(TASK_INTERRUPTIBLE);
102 schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
103
104 VDEBUG(("Voyager Daemon awoken\n"));
105 if(voyager_status.request_from_kernel == 0) {
106 /* probably awoken from timeout */
107 check_continuing_condition();
108 } else {
109 check_from_kernel();
110 voyager_status.request_from_kernel = 0;
111 }
112 }
113}
114
115static int __init
116voyager_thread_start(void)
117{
118 voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
119 if (IS_ERR(voyager_thread)) {
120 printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n");
121 return PTR_ERR(voyager_thread);
122 }
123 return 0;
124}
125
126
127static void __exit
128voyager_thread_stop(void)
129{
130 kthread_stop(voyager_thread);
131}
132
133module_init(voyager_thread_start);
134module_exit(voyager_thread_stop);
diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile
new file mode 100644
index 000000000000..9c943fa6ce6b
--- /dev/null
+++ b/arch/x86/math-emu/Makefile
@@ -0,0 +1,30 @@
1#
2# Makefile for wm-FPU-emu
3#
4
5#DEBUG = -DDEBUGGING
6DEBUG =
7PARANOID = -DPARANOID
8CFLAGS := $(CFLAGS) $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION)
9
10EXTRA_AFLAGS := $(PARANOID)
11
12# From 'C' language sources:
13C_OBJS =fpu_entry.o errors.o \
14 fpu_arith.o fpu_aux.o fpu_etc.o fpu_tags.o fpu_trig.o \
15 load_store.o get_address.o \
16 poly_atan.o poly_l2.o poly_2xm1.o poly_sin.o poly_tan.o \
17 reg_add_sub.o reg_compare.o reg_constant.o reg_convert.o \
18 reg_ld_str.o reg_divide.o reg_mul.o
19
20# From 80x86 assembler sources:
21A_OBJS =reg_u_add.o reg_u_div.o reg_u_mul.o reg_u_sub.o \
22 div_small.o reg_norm.o reg_round.o \
23 wm_shrx.o wm_sqrt.o \
24 div_Xsig.o polynom_Xsig.o round_Xsig.o \
25 shr_Xsig.o mul_Xsig.o
26
27obj-y =$(C_OBJS) $(A_OBJS)
28
29proto:
30 cproto -e -DMAKING_PROTO *.c >fpu_proto.h
diff --git a/arch/x86/math-emu/README b/arch/x86/math-emu/README
new file mode 100644
index 000000000000..e6235491d6eb
--- /dev/null
+++ b/arch/x86/math-emu/README
@@ -0,0 +1,427 @@
1 +---------------------------------------------------------------------------+
2 | wm-FPU-emu an FPU emulator for 80386 and 80486SX microprocessors. |
3 | |
4 | Copyright (C) 1992,1993,1994,1995,1996,1997,1999 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@melbpc.org.au |
7 | |
8 | This program is free software; you can redistribute it and/or modify |
9 | it under the terms of the GNU General Public License version 2 as |
10 | published by the Free Software Foundation. |
11 | |
12 | This program is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | GNU General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU General Public License |
18 | along with this program; if not, write to the Free Software |
19 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
20 | |
21 +---------------------------------------------------------------------------+
22
23
24
25wm-FPU-emu is an FPU emulator for Linux. It is derived from wm-emu387
26which was my 80387 emulator for early versions of djgpp (gcc under
27msdos); wm-emu387 was in turn based upon emu387 which was written by
28DJ Delorie for djgpp. The interface to the Linux kernel is based upon
29the original Linux math emulator by Linus Torvalds.
30
31My target FPU for wm-FPU-emu is that described in the Intel486
32Programmer's Reference Manual (1992 edition). Unfortunately, numerous
33facets of the functioning of the FPU are not well covered in the
34Reference Manual. The information in the manual has been supplemented
35with measurements on real 80486's. Unfortunately, it is simply not
36possible to be sure that all of the peculiarities of the 80486 have
37been discovered, so there are always likely to be obscure differences
38in the detailed behaviour of the emulator and a real 80486.
39
40wm-FPU-emu does not implement all of the behaviour of the 80486 FPU,
41but is very close. See "Limitations" later in this file for a list of
42some differences.
43
44Please report bugs, etc to me at:
45 billm@melbpc.org.au
46or b.metzenthen@medoto.unimelb.edu.au
47
48For more information on the emulator and on floating point topics, see
49my web pages, currently at http://www.suburbia.net/~billm/
50
51
52--Bill Metzenthen
53 December 1999
54
55
56----------------------- Internals of wm-FPU-emu -----------------------
57
58Numeric algorithms:
59(1) Add, subtract, and multiply. Nothing remarkable in these.
60(2) Divide has been tuned to get reasonable performance. The algorithm
61 is not the obvious one which most people seem to use, but is designed
62 to take advantage of the characteristics of the 80386. I expect that
63 it has been invented many times before I discovered it, but I have not
64 seen it. It is based upon one of those ideas which one carries around
65 for years without ever bothering to check it out.
66(3) The sqrt function has been tuned to get good performance. It is based
67 upon Newton's classic method. Performance was improved by capitalizing
68 upon the properties of Newton's method, and the code is once again
69   structured taking account of the 80386 characteristics (a sketch of the basic iteration follows this list).
70(4) The trig, log, and exp functions are based in each case upon quasi-
71 "optimal" polynomial approximations. My definition of "optimal" was
72 based upon getting good accuracy with reasonable speed.
73(5) The argument reducing code for the trig function effectively uses
74 a value of pi which is accurate to more than 128 bits. As a consequence,
75 the reduced argument is accurate to more than 64 bits for arguments up
76 to a few pi, and accurate to more than 64 bits for most arguments,
77 even for arguments approaching 2^63. This is far superior to an
78 80486, which uses a value of pi which is accurate to 66 bits.
79
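As a rough illustration of item (3), here is the classical Newton iteration
that the sqrt code is built around. This is only a sketch in ordinary C
doubles (the function name and the use of double are mine, purely for
illustration); the emulator itself works on fixed-point significands and is
structured around 80386 instruction timings.

#include <stdio.h>

/* Illustrative only: classical Newton-Raphson iteration for sqrt(a),
 * not the emulator's fixed-point implementation. */
static double newton_sqrt(double a)
{
	double x = a > 1.0 ? a : 1.0;		/* crude initial guess */
	int i;

	for (i = 0; i < 60; i++) {
		double next = 0.5 * (x + a / x);	/* Newton step */
		if (next == x)
			break;				/* converged */
		x = next;
	}
	return x;
}

int main(void)
{
	printf("%.17g\n", newton_sqrt(2.0));	/* prints ~1.4142135623730951 */
	return 0;
}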
80The code of the emulator is complicated slightly by the need to
81account for a limited form of re-entrancy. Normally, the emulator will
82emulate each FPU instruction to completion without interruption.
83However, it may happen that when the emulator is accessing the user
84memory space, swapping may be needed. In this case the emulator may be
85temporarily suspended while disk i/o takes place. During this time
86another process may use the emulator, thereby perhaps changing static
87variables. The code which accesses user memory is confined to five
88files:
89 fpu_entry.c
90 reg_ld_str.c
91 load_store.c
92 get_address.c
93 errors.c
94As from version 1.12 of the emulator, no static variables are used
95(apart from those in the kernel's per-process tables). The emulator is
96therefore now fully re-entrant, rather than having just the restricted
97form of re-entrancy which is required by the Linux kernel.
98
99----------------------- Limitations of wm-FPU-emu -----------------------
100
101There are a number of differences between the current wm-FPU-emu
102(version 2.01) and the 80486 FPU (apart from bugs). The differences
103are fewer than those which applied to the 1.xx series of the emulator.
104Some of the more important differences are listed below:
105
106The Roundup flag does not have much meaning for the transcendental
107functions and its 80486 value with these functions is likely to differ
108from its emulator value.
109
110In a few rare cases the Underflow flag obtained with the emulator will
111be different from that obtained with an 80486. This occurs when the
112following conditions apply simultaneously:
113(a) the operands have a higher precision than the current setting of the
114 precision control (PC) flags.
115(b) the underflow exception is masked.
116(c) the magnitude of the exact result (before rounding) is less than 2^-16382.
117(d) the magnitude of the final result (after rounding) is exactly 2^-16382.
118(e) the magnitude of the exact result would be exactly 2^-16382 if the
119 operands were rounded to the current precision before the arithmetic
120 operation was performed.
121If all of these apply, the emulator will set the Underflow flag but a real
12280486 will not.
123
124NOTE: Certain formats of Extended Real are UNSUPPORTED. They are
125unsupported by the 80486. They are the Pseudo-NaNs, Pseudoinfinities,
126and Unnormals. None of these will be generated by an 80486 or by the
127emulator. Do not use them. The emulator treats them differently in
128detail from the way an 80486 does.
129
130Self modifying code can cause the emulator to fail. An example of such
131code is:
132 movl %esp,[%ebx]
133 fld1
134The FPU instruction may be (usually will be) loaded into the pre-fetch
135queue of the CPU before the mov instruction is executed. If the
136destination of the 'movl' overlaps the FPU instruction then the bytes
137in the prefetch queue and memory will be inconsistent when the FPU
138instruction is executed. The emulator will be invoked but will not be
139able to find the instruction which caused the device-not-present
140exception. For this case, the emulator cannot emulate the behaviour of
141an 80486DX.
142
143Handling of the address size override prefix byte (0x67) has not been
144extensively tested yet. A major problem exists because using it in
145vm86 mode can cause a general protection fault. Address offsets
146greater than 0xffff appear to be illegal in vm86 mode but are quite
147acceptable (and work) in real mode. A small test program developed to
148check the addressing, and which runs successfully in real mode,
149crashes dosemu under Linux and also brings Windows down with a general
150protection fault message when run under the MS-DOS prompt of Windows
1513.1. (The program simply reads data from a valid address).
152
153The emulator supports 16-bit protected mode, with one difference from
154an 80486DX. A 80486DX will allow some floating point instructions to
155write a few bytes below the lowest address of the stack. The emulator
156will not allow this in 16-bit protected mode: no instructions are
157allowed to write outside the bounds set by the protection.
158
159----------------------- Performance of wm-FPU-emu -----------------------
160
161Speed.
162-----
163
164The speed of floating point computation with the emulator will depend
165upon instruction mix. Relative performance is best for the instructions
166which require most computation. The simple instructions are adversely
167affected by the FPU instruction trap overhead.
168
169
170Timing: Some simple timing tests have been made on the emulator functions.
171The times include load/store instructions. All times are in microseconds
172measured on a 33MHz 386 with 64k cache. The Turbo C tests were under
173ms-dos, the next two columns are for emulators running with the djgpp
174ms-dos extender. The final column is for wm-FPU-emu in Linux 0.97,
175using libm4.0 (hard).
176
177function Turbo C djgpp 1.06 WM-emu387 wm-FPU-emu
178
179 + 60.5 154.8 76.5 139.4
180 - 61.1-65.5 157.3-160.8 76.2-79.5 142.9-144.7
181 * 71.0 190.8 79.6 146.6
182 / 61.2-75.0 261.4-266.9 75.3-91.6 142.2-158.1
183
184 sin() 310.8 4692.0 319.0 398.5
185 cos() 284.4 4855.2 308.0 388.7
186 tan() 495.0 8807.1 394.9 504.7
187 atan() 328.9 4866.4 601.1 419.5-491.9
188
189 sqrt() 128.7 crashed 145.2 227.0
190 log() 413.1-419.1 5103.4-5354.21 254.7-282.2 409.4-437.1
191 exp() 479.1 6619.2 469.1 850.8
192
193
194The performance under Linux is improved by the use of look-ahead code.
195The following results show the improvement which is obtained under
196Linux due to the look-ahead code. Also given are the times for the
197original Linux emulator with the 4.1 'soft' lib.
198
199 [ Linus' note: I changed look-ahead to be the default under linux, as
200 there was no reason not to use it after I had edited it to be
201 disabled during tracing ]
202
203 wm-FPU-emu w original w
204 look-ahead 'soft' lib
205 + 106.4 190.2
206 - 108.6-111.6 192.4-216.2
207 * 113.4 193.1
208 / 108.8-124.4 700.1-706.2
209
210 sin() 390.5 2642.0
211 cos() 381.5 2767.4
212 tan() 496.5 3153.3
213 atan() 367.2-435.5 2439.4-3396.8
214
215 sqrt() 195.1 4732.5
216 log() 358.0-387.5 3359.2-3390.3
217 exp() 619.3 4046.4
218
219
220These figures are now somewhat out-of-date. The emulator has become
221progressively slower for most functions as more of the 80486 features
222have been implemented.
223
224
225----------------------- Accuracy of wm-FPU-emu -----------------------
226
227
228The accuracy of the emulator is in almost all cases equal to or better
229than that of an Intel 80486 FPU.
230
231The results of the basic arithmetic functions (+,-,*,/), and fsqrt
232match those of an 80486 FPU. They are the best possible; the error for
233these never exceeds 1/2 an lsb. The fprem and fprem1 instructions
234return exact results; they have no error.
235
236
237The following table compares the emulator accuracy for the sqrt(),
238trig and log functions against the Turbo C "emulator". For this table,
239each function was tested at about 400 points. Ideal worst-case results
240would be 64 bits. The reduced Turbo C accuracy of cos() and tan() for
241arguments greater than pi/4 can be thought of as being related to the
242precision of the argument x; e.g. an argument of pi/2-(1e-10) which is
243accurate to 64 bits can result in a relative accuracy in cos() of
244about 64 + log2(cos(x)) = 31 bits.
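(To make that arithmetic concrete, using rough numbers purely as an
illustration: with x = pi/2 - 1e-10 we have cos(x) = sin(1e-10), which is
about 1e-10; log2(1e-10) is about -33.2, so 64 - 33 leaves roughly 31 bits,
consistent with the 31.9 figures in the Turbo C column below.)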
245
246
247Function Tested x range Worst result Turbo C
248 (relative bits)
249
250sqrt(x) 1 .. 2 64.1 63.2
251atan(x) 1e-10 .. 200 64.2 62.8
252cos(x) 0 .. pi/2-(1e-10) 64.4 (x <= pi/4) 62.4
253 64.1 (x = pi/2-(1e-10)) 31.9
254sin(x) 1e-10 .. pi/2 64.0 62.8
255tan(x) 1e-10 .. pi/2-(1e-10) 64.0 (x <= pi/4) 62.1
256 64.1 (x = pi/2-(1e-10)) 31.9
257exp(x) 0 .. 1 63.1 ** 62.9
258log(x) 1+1e-6 .. 2 63.8 ** 62.1
259
260** The accuracy for exp() and log() is low because the FPU (emulator)
261does not compute them directly; two operations are required.
262
263
264The emulator passes the "paranoia" tests (compiled with gcc 2.3.3 or
265later) for 'float' variables (24 bit precision numbers) when precision
266control is set to 24, 53 or 64 bits, and for 'double' variables (53
267bit precision numbers) when precision control is set to 53 bits (a
268properly performing FPU cannot pass the 'paranoia' tests for 'double'
269variables when precision control is set to 64 bits).
270
271The code for reducing the argument for the trig functions (fsin, fcos,
272fptan and fsincos) has been improved and now effectively uses a value
273for pi which is accurate to more than 128 bits precision. As a
274consequence, the accuracy of these functions for large arguments has
275been dramatically improved (and is now very much better than an 80486
276FPU). There is also now no degradation of accuracy for fcos and fptan
277for operands close to pi/2. Measured results are (note that the
278definition of accuracy has changed slightly from that used for the
279above table):
280
281Function Tested x range Worst result
282 (absolute bits)
283
284cos(x) 0 .. 9.22e+18 62.0
285sin(x) 1e-16 .. 9.22e+18 62.1
286tan(x) 1e-16 .. 9.22e+18 61.8
287
288It is possible with some effort to find very large arguments which
289give much degraded precision. For example, the integer number
290 8227740058411162616.0
291is within about 10e-7 of a multiple of pi. To find the tan (for
292example) of this number to 64 bits precision it would be necessary to
293have a value of pi which had about 150 bits precision. The FPU
294emulator computes the result to about 42.6 bits precision (the correct
295result is about -9.739715e-8). On the other hand, an 80486 FPU returns
2960.01059, which in relative terms is hopelessly inaccurate.
297
298For arguments close to critical angles (which occur at multiples of
299pi/2) the emulator is more accurate than an 80486 FPU. For very large
300arguments, the emulator is far more accurate.
301
302
303Prior to version 1.20 of the emulator, the accuracy of the results for
304the transcendental functions (in their principal range) was not as
305good as the results from an 80486 FPU. From version 1.20, the accuracy
306has been considerably improved and these functions now give measured
307worst-case results which are better than the worst-case results given
308by an 80486 FPU.
309
310The following table gives the measured results for the emulator. The
311number of randomly selected arguments in each case is about half a
312million. The group of three columns gives the frequency of the given
313accuracy in number of times per million, thus the second of these
314columns shows that an accuracy of between 63.80 and 63.89 bits was
315found at a rate of 133 times per one million measurements for fsin.
316The results show that the fsin, fcos and fptan instructions return
317results which are in error (i.e. less accurate than the best possible
318result (which is 64 bits)) for about one per cent of all arguments
319between -pi/2 and +pi/2. The other instructions have a lower
320frequency of results which are in error. The last two columns give
321the worst accuracy which was found (in bits) and the approximate value
322of the argument which produced it.
323
324 frequency (per M)
325 ------------------- ---------------
326instr arg range # tests 63.7 63.8 63.9 worst at arg
327 bits bits bits bits
328----- ------------ ------- ---- ---- ----- ----- --------
329fsin (0,pi/2) 547756 0 133 10673 63.89 0.451317
330fcos (0,pi/2) 547563 0 126 10532 63.85 0.700801
331fptan (0,pi/2) 536274 11 267 10059 63.74 0.784876
332fpatan 4 quadrants 517087 0 8 1855 63.88 0.435121 (4q)
333fyl2x (0,20) 541861 0 0 1323 63.94 1.40923 (x)
334fyl2xp1 (-.293,.414) 520256 0 0 5678 63.93 0.408542 (x)
335f2xm1 (-1,1) 538847 4 481 6488 63.79 0.167709
336
337
338Tests performed on an 80486 FPU showed results of lower accuracy. The
339following table gives the results which were obtained with an AMD
340486DX2/66 (other tests indicate that an Intel 486DX produces
341identical results). The tests were basically the same as those used
342to measure the emulator (the values, being random, were in general not
343the same). The total number of tests for each instruction is given
344at the end of the table; in each case about 100k tests were performed.
345Another line of figures at the end of the table shows that most of the
346instructions return results which are in error for more than 10
347percent of the arguments tested.
348
349The numbers in the body of the table give the approx number of times a
350result of the given accuracy in bits (given in the left-most column)
351was obtained per one million arguments. For three of the instructions,
352two columns of results are given: * The second column for f2xm1 gives
353the number of cases where the results of the first column were for a
354positive argument, this shows that this instruction gives better
355results for positive arguments than it does for negative. * In the
356cases of fcos and fptan, the first column gives the results obtained when all
357cases with arguments greater than 1.5 are removed from the results
358given in the second column. Unlike the emulator, an 80486 FPU returns
359results of relatively poor accuracy for these instructions when the
360argument approaches pi/2. The table does not show those cases when the
361accuracy of the results was less than 62 bits, which occurs quite
362often for fsin and fptan when the argument approaches pi/2. This poor
363accuracy is discussed above in relation to the Turbo C "emulator", and
364the accuracy of the value of pi.
365
366
367bits f2xm1 f2xm1 fpatan fcos fcos fyl2x fyl2xp1 fsin fptan fptan
36862.0 0 0 0 0 437 0 0 0 0 925
36962.1 0 0 10 0 894 0 0 0 0 1023
37062.2 14 0 0 0 1033 0 0 0 0 945
37162.3 57 0 0 0 1202 0 0 0 0 1023
37262.4 385 0 0 10 1292 0 23 0 0 1178
37362.5 1140 0 0 119 1649 0 39 0 0 1149
37462.6 2037 0 0 189 1620 0 16 0 0 1169
37562.7 5086 14 0 646 2315 10 101 35 39 1402
37662.8 8818 86 0 984 3050 59 287 131 224 2036
37762.9 11340 1355 0 2126 4153 79 605 357 321 1948
37863.0 15557 4750 0 3319 5376 246 1281 862 808 2688
37963.1 20016 8288 0 4620 6628 511 2569 1723 1510 3302
38063.2 24945 11127 10 6588 8098 1120 4470 2968 2990 4724
38163.3 25686 12382 69 8774 10682 1906 6775 4482 5474 7236
38263.4 29219 14722 79 11109 12311 3094 9414 7259 8912 10587
38363.5 30458 14936 393 13802 15014 5874 12666 9609 13762 15262
38463.6 32439 16448 1277 17945 19028 10226 15537 14657 19158 20346
38563.7 35031 16805 4067 23003 23947 18910 20116 21333 25001 26209
38663.8 33251 15820 7673 24781 25675 24617 25354 24440 29433 30329
38763.9 33293 16833 18529 28318 29233 31267 31470 27748 29676 30601
388
389Per cent with error:
390 30.9 3.2 18.5 9.8 13.1 11.6 17.4
391Total arguments tested:
392 70194 70099 101784 100641 100641 101799 128853 114893 102675 102675
393
394
395------------------------- Contributors -------------------------------
396
397A number of people have contributed to the development of the
398emulator, often by just reporting bugs, sometimes with suggested
399fixes, and a few kind people have provided me with access in one way
400or another to an 80486 machine. Contributors include (to those people
401who I may have forgotten, please forgive me):
402
403Linus Torvalds
404Tommy.Thorn@daimi.aau.dk
405Andrew.Tridgell@anu.edu.au
406Nick Holloway, alfie@dcs.warwick.ac.uk
407Hermano Moura, moura@dcs.gla.ac.uk
408Jon Jagger, J.Jagger@scp.ac.uk
409Lennart Benschop
410Brian Gallew, geek+@CMU.EDU
411Thomas Staniszewski, ts3v+@andrew.cmu.edu
412Martin Howell, mph@plasma.apana.org.au
413M Saggaf, alsaggaf@athena.mit.edu
414Peter Barker, PETER@socpsy.sci.fau.edu
415tom@vlsivie.tuwien.ac.at
416Dan Russel, russed@rpi.edu
417Daniel Carosone, danielce@ee.mu.oz.au
418cae@jpmorgan.com
419Hamish Coleman, t933093@minyos.xx.rmit.oz.au
420Bruce Evans, bde@kralizec.zeta.org.au
421Timo Korvola, Timo.Korvola@hut.fi
422Rick Lyons, rick@razorback.brisnet.org.au
423Rick, jrs@world.std.com
424
425...and numerous others who responded to my request for help with
426a real 80486.
427
diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h
new file mode 100644
index 000000000000..ae2274dbd305
--- /dev/null
+++ b/arch/x86/math-emu/control_w.h
@@ -0,0 +1,45 @@
1/*---------------------------------------------------------------------------+
2 | control_w.h |
3 | |
4 | Copyright (C) 1992,1993 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _CONTROLW_H_
11#define _CONTROLW_H_
12
13#ifdef __ASSEMBLY__
14#define _Const_(x) $##x
15#else
16#define _Const_(x) x
17#endif
18
19#define CW_RC _Const_(0x0C00) /* rounding control */
20#define CW_PC _Const_(0x0300) /* precision control */
21
22#define CW_Precision	_Const_(0x0020)	/* loss of precision mask */
23#define CW_Underflow	_Const_(0x0010)	/* underflow mask */
24#define CW_Overflow	_Const_(0x0008)	/* overflow mask */
25#define CW_ZeroDiv	_Const_(0x0004)	/* divide by zero mask */
26#define CW_Denormal	_Const_(0x0002)	/* denormalized operand mask */
27#define CW_Invalid	_Const_(0x0001)	/* invalid operation mask */
28
29#define CW_Exceptions _Const_(0x003f) /* all masks */
30
31#define RC_RND _Const_(0x0000)
32#define RC_DOWN _Const_(0x0400)
33#define RC_UP _Const_(0x0800)
34#define RC_CHOP _Const_(0x0C00)
35
36/* p 15-5: Precision control bits affect only the following:
37 ADD, SUB(R), MUL, DIV(R), and SQRT */
38#define PR_24_BITS _Const_(0x000)
39#define PR_53_BITS _Const_(0x200)
40#define PR_64_BITS _Const_(0x300)
41#define PR_RESERVED_BITS _Const_(0x100)
42/* FULL_PRECISION simulates all exceptions masked */
43#define FULL_PRECISION (PR_64_BITS | RC_RND | 0x3f)
44
45#endif /* _CONTROLW_H_ */
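As a small usage sketch of how these control-word masks are meant to be
combined and tested (this example is mine, not part of the header; the
standalone redefinitions below just mirror the values above so the snippet
compiles on its own, and 0x037f is the conventional x87 power-on control
word):

#include <stdint.h>
#include <stdio.h>

/* Values copied from control_w.h so this sketch is self-contained. */
#define CW_RC	0x0C00		/* rounding control field */
#define RC_RND	0x0000
#define RC_DOWN	0x0400
#define RC_UP	0x0800
#define RC_CHOP	0x0C00

/* Decode the rounding-control field of an x87 control word. */
static const char *rc_name(uint16_t cw)
{
	switch (cw & CW_RC) {
	case RC_RND:	return "round to nearest";
	case RC_DOWN:	return "round toward -infinity";
	case RC_UP:	return "round toward +infinity";
	default:	return "truncate (chop)";
	}
}

int main(void)
{
	uint16_t cw = 0x037f;	/* typical x87 default control word */

	printf("rounding mode: %s\n", rc_name(cw));
	return 0;
}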
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
new file mode 100644
index 000000000000..f77ba3058b31
--- /dev/null
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -0,0 +1,365 @@
1 .file "div_Xsig.S"
2/*---------------------------------------------------------------------------+
3 | div_Xsig.S |
4 | |
5 | Division subroutine for 96 bit quantities |
6 | |
7 | Copyright (C) 1994,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Divide the 96 bit quantity pointed to by a, by that pointed to by b, and |
16 | put the 96 bit result at the location d. |
17 | |
18 | The result may not be accurate to 96 bits. It is intended for use where |
19 | a result better than 64 bits is required. The result should usually be |
20 | good to at least 94 bits. |
21 | The returned result is actually divided by two. This is done to           |
22 | prevent overflow. |
23 | |
24 | .aaaaaaaaaaaaaa / .bbbbbbbbbbbbb -> .dddddddddddd |
25 | |
26 | void div_Xsig(Xsig *a, Xsig *b, Xsig *dest) |
27 | |
28 +---------------------------------------------------------------------------*/
29
30#include "exception.h"
31#include "fpu_emu.h"
32
33
34#define XsigLL(x) (x)
35#define XsigL(x) 4(x)
36#define XsigH(x) 8(x)
37
38
39#ifndef NON_REENTRANT_FPU
40/*
41 Local storage on the stack:
42 Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
43 */
44#define FPU_accum_3 -4(%ebp)
45#define FPU_accum_2 -8(%ebp)
46#define FPU_accum_1 -12(%ebp)
47#define FPU_accum_0 -16(%ebp)
48#define FPU_result_3 -20(%ebp)
49#define FPU_result_2 -24(%ebp)
50#define FPU_result_1 -28(%ebp)
51
52#else
53.data
54/*
55 Local storage in a static area:
56 Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
57 */
58 .align 4,0
59FPU_accum_3:
60 .long 0
61FPU_accum_2:
62 .long 0
63FPU_accum_1:
64 .long 0
65FPU_accum_0:
66 .long 0
67FPU_result_3:
68 .long 0
69FPU_result_2:
70 .long 0
71FPU_result_1:
72 .long 0
73#endif /* NON_REENTRANT_FPU */
74
75
76.text
77ENTRY(div_Xsig)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83
84 pushl %esi
85 pushl %edi
86 pushl %ebx
87
88 movl PARAM1,%esi /* pointer to num */
89 movl PARAM2,%ebx /* pointer to denom */
90
91#ifdef PARANOID
92 testl $0x80000000, XsigH(%ebx) /* Divisor */
93 je L_bugged
94#endif /* PARANOID */
95
96
97/*---------------------------------------------------------------------------+
98 | Divide: Return arg1/arg2 to arg3. |
99 | |
100 | The maximum returned value is (ignoring exponents) |
101 | .ffffffff ffffffff |
102 | ------------------ = 1.ffffffff fffffffe |
103 | .80000000 00000000 |
104 | and the minimum is |
105 | .80000000 00000000 |
106 | ------------------ = .80000000 00000001 (rounded) |
107 | .ffffffff ffffffff |
108 | |
109 +---------------------------------------------------------------------------*/
110
111 /* Save extended dividend in local register */
112
113 /* Divide by 2 to prevent overflow */
114 clc
115 movl XsigH(%esi),%eax
116 rcrl %eax
117 movl %eax,FPU_accum_3
118 movl XsigL(%esi),%eax
119 rcrl %eax
120 movl %eax,FPU_accum_2
121 movl XsigLL(%esi),%eax
122 rcrl %eax
123 movl %eax,FPU_accum_1
124 movl $0,%eax
125 rcrl %eax
126 movl %eax,FPU_accum_0
127
128 movl FPU_accum_2,%eax /* Get the current num */
129 movl FPU_accum_3,%edx
130
131/*----------------------------------------------------------------------*/
132/* Initialization done.
133 Do the first 32 bits. */
134
135 /* We will divide by a number which is too large */
136 movl XsigH(%ebx),%ecx
137 addl $1,%ecx
138 jnc LFirst_div_not_1
139
140 /* here we need to divide by 100000000h,
141 i.e., no division at all.. */
142 mov %edx,%eax
143 jmp LFirst_div_done
144
145LFirst_div_not_1:
146 divl %ecx /* Divide the numerator by the augmented
147 denom ms dw */
148
149LFirst_div_done:
150 movl %eax,FPU_result_3 /* Put the result in the answer */
151
152 mull XsigH(%ebx) /* mul by the ms dw of the denom */
153
154 subl %eax,FPU_accum_2 /* Subtract from the num local reg */
155 sbbl %edx,FPU_accum_3
156
157 movl FPU_result_3,%eax /* Get the result back */
158 mull XsigL(%ebx) /* now mul the ls dw of the denom */
159
160 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
161 sbbl %edx,FPU_accum_2
162 sbbl $0,FPU_accum_3
163 je LDo_2nd_32_bits /* Must check for non-zero result here */
164
165#ifdef PARANOID
166 jb L_bugged_1
167#endif /* PARANOID */
168
 169 /* need to subtract the denom one more time */
170 incl FPU_result_3 /* Correct the answer */
171
172 movl XsigL(%ebx),%eax
173 movl XsigH(%ebx),%edx
174 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
175 sbbl %edx,FPU_accum_2
176
177#ifdef PARANOID
178 sbbl $0,FPU_accum_3
179 jne L_bugged_1 /* Must check for non-zero result here */
180#endif /* PARANOID */
181
182/*----------------------------------------------------------------------*/
183/* Half of the main problem is done, there is just a reduced numerator
184 to handle now.
185 Work with the second 32 bits, FPU_accum_0 not used from now on */
186LDo_2nd_32_bits:
187 movl FPU_accum_2,%edx /* get the reduced num */
188 movl FPU_accum_1,%eax
189
190 /* need to check for possible subsequent overflow */
191 cmpl XsigH(%ebx),%edx
192 jb LDo_2nd_div
193 ja LPrevent_2nd_overflow
194
195 cmpl XsigL(%ebx),%eax
196 jb LDo_2nd_div
197
198LPrevent_2nd_overflow:
199/* The numerator is greater or equal, would cause overflow */
200 /* prevent overflow */
201 subl XsigL(%ebx),%eax
202 sbbl XsigH(%ebx),%edx
203 movl %edx,FPU_accum_2
204 movl %eax,FPU_accum_1
205
206 incl FPU_result_3 /* Reflect the subtraction in the answer */
207
208#ifdef PARANOID
209 je L_bugged_2 /* Can't bump the result to 1.0 */
210#endif /* PARANOID */
211
212LDo_2nd_div:
213 cmpl $0,%ecx /* augmented denom msw */
214 jnz LSecond_div_not_1
215
216 /* %ecx == 0, we are dividing by 1.0 */
217 mov %edx,%eax
218 jmp LSecond_div_done
219
220LSecond_div_not_1:
221 divl %ecx /* Divide the numerator by the denom ms dw */
222
223LSecond_div_done:
224 movl %eax,FPU_result_2 /* Put the result in the answer */
225
226 mull XsigH(%ebx) /* mul by the ms dw of the denom */
227
228 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
229 sbbl %edx,FPU_accum_2
230
231#ifdef PARANOID
232 jc L_bugged_2
233#endif /* PARANOID */
234
235 movl FPU_result_2,%eax /* Get the result back */
236 mull XsigL(%ebx) /* now mul the ls dw of the denom */
237
238 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
239 sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
240 sbbl $0,FPU_accum_2
241
242#ifdef PARANOID
243 jc L_bugged_2
244#endif /* PARANOID */
245
246 jz LDo_3rd_32_bits
247
248#ifdef PARANOID
249 cmpl $1,FPU_accum_2
250 jne L_bugged_2
251#endif /* PARANOID */
252
 253 /* need to subtract the denom one more time */
254 movl XsigL(%ebx),%eax
255 movl XsigH(%ebx),%edx
256 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
257 sbbl %edx,FPU_accum_1
258 sbbl $0,FPU_accum_2
259
260#ifdef PARANOID
261 jc L_bugged_2
262 jne L_bugged_2
263#endif /* PARANOID */
264
265 addl $1,FPU_result_2 /* Correct the answer */
266 adcl $0,FPU_result_3
267
268#ifdef PARANOID
269 jc L_bugged_2 /* Must check for non-zero result here */
270#endif /* PARANOID */
271
272/*----------------------------------------------------------------------*/
273/* The division is essentially finished here, we just need to perform
274 tidying operations.
275 Deal with the 3rd 32 bits */
276LDo_3rd_32_bits:
277 /* We use an approximation for the third 32 bits.
278 To take account of the 3rd 32 bits of the divisor
279 (call them del), we subtract del * (a/b) */
280
281 movl FPU_result_3,%eax /* a/b */
282 mull XsigLL(%ebx) /* del */
283
284 subl %edx,FPU_accum_1
285
286 /* A borrow indicates that the result is negative */
287 jnb LTest_over
288
289 movl XsigH(%ebx),%edx
290 addl %edx,FPU_accum_1
291
292 subl $1,FPU_result_2 /* Adjust the answer */
293 sbbl $0,FPU_result_3
294
295 /* The above addition might not have been enough, check again. */
296 movl FPU_accum_1,%edx /* get the reduced num */
297 cmpl XsigH(%ebx),%edx /* denom */
298 jb LDo_3rd_div
299
300 movl XsigH(%ebx),%edx
301 addl %edx,FPU_accum_1
302
303 subl $1,FPU_result_2 /* Adjust the answer */
304 sbbl $0,FPU_result_3
305 jmp LDo_3rd_div
306
307LTest_over:
308 movl FPU_accum_1,%edx /* get the reduced num */
309
310 /* need to check for possible subsequent overflow */
311 cmpl XsigH(%ebx),%edx /* denom */
312 jb LDo_3rd_div
313
314 /* prevent overflow */
315 subl XsigH(%ebx),%edx
316 movl %edx,FPU_accum_1
317
318 addl $1,FPU_result_2 /* Reflect the subtraction in the answer */
319 adcl $0,FPU_result_3
320
321LDo_3rd_div:
322 movl FPU_accum_0,%eax
323 movl FPU_accum_1,%edx
324 divl XsigH(%ebx)
325
326 movl %eax,FPU_result_1 /* Rough estimate of third word */
327
328 movl PARAM3,%esi /* pointer to answer */
329
330 movl FPU_result_1,%eax
331 movl %eax,XsigLL(%esi)
332 movl FPU_result_2,%eax
333 movl %eax,XsigL(%esi)
334 movl FPU_result_3,%eax
335 movl %eax,XsigH(%esi)
336
337L_exit:
338 popl %ebx
339 popl %edi
340 popl %esi
341
342 leave
343 ret
344
345
346#ifdef PARANOID
347/* The logic is wrong if we got here */
348L_bugged:
349 pushl EX_INTERNAL|0x240
350 call EXCEPTION
351 pop %ebx
352 jmp L_exit
353
354L_bugged_1:
355 pushl EX_INTERNAL|0x241
356 call EXCEPTION
357 pop %ebx
358 jmp L_exit
359
360L_bugged_2:
361 pushl EX_INTERNAL|0x242
362 call EXCEPTION
363 pop %ebx
364 jmp L_exit
365#endif /* PARANOID */
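
The first stage above estimates a quotient word by dividing the top 64 bits of the (halved) numerator by XsigH(denom)+1 -- an estimate that can never be too large -- and then fixes it up with at most one extra subtraction, with PARANOID trapping the case where that is not enough. The same estimate-then-correct idea is easier to follow in C. A minimal user-space sketch, assuming a GCC/Clang unsigned __int128; the function name is made up and this is not part of the patch:

#include <stdint.h>

/* One quotient-digit step in the style of LFirst_div_not_1/LFirst_div_done:
   compute q = floor((n << 32) / d) for n < d, where d has its top bit set.
   Dividing by (high word of d) + 1 cannot overflow and never yields a value
   larger than the true digit, so a short correction loop finishes the job. */
static uint32_t quot_digit(uint64_t n, uint64_t d)
{
        uint32_t d_hi = (uint32_t)(d >> 32);
        uint64_t q;
        unsigned __int128 rem;

        if (d_hi == 0xffffffffu)                /* augmented divisor is 2^32, */
                q = n >> 32;                    /* so the divide is just a shift */
        else
                q = n / ((uint64_t)d_hi + 1);   /* estimate, never too large */

        rem = ((unsigned __int128)n << 32) - (unsigned __int128)q * d;
        while (rem >= d) {                      /* correct the estimate upwards */
                rem -= d;
                q++;
        }
        return (uint32_t)q;
}
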
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
new file mode 100644
index 000000000000..47099628fa4c
--- /dev/null
+++ b/arch/x86/math-emu/div_small.S
@@ -0,0 +1,47 @@
1 .file "div_small.S"
2/*---------------------------------------------------------------------------+
3 | div_small.S |
4 | |
5 | Divide a 64 bit integer by a 32 bit integer & return remainder. |
6 | |
7 | Copyright (C) 1992,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | unsigned long FPU_div_small(unsigned long long *x, unsigned long y) |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19
20.text
21ENTRY(FPU_div_small)
22 pushl %ebp
23 movl %esp,%ebp
24
25 pushl %esi
26
27 movl PARAM1,%esi /* pointer to num */
28 movl PARAM2,%ecx /* The denominator */
29
30 movl 4(%esi),%eax /* Get the current num msw */
31 xorl %edx,%edx
32 divl %ecx
33
34 movl %eax,4(%esi)
35
36 movl (%esi),%eax /* Get the num lsw */
37 divl %ecx
38
39 movl %eax,(%esi)
40
41 movl %edx,%eax /* Return the remainder in eax */
42
43 popl %esi
44
45 leave
46 ret
47
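
The two chained divl instructions above perform an ordinary 64-by-32 long division, most significant word first, with the partial remainder carried across in %edx. For reference, a hedged C equivalent (a sketch only; the emulator uses the assembly version, presumably so the 32-bit kernel avoids pulling in a libgcc 64-bit division helper):

/* Reference behaviour of FPU_div_small(): divide *x in place by y and
   return the remainder.  Hypothetical stand-alone version, not the one
   built into the emulator. */
unsigned long FPU_div_small_ref(unsigned long long *x, unsigned long y)
{
        unsigned long long rem = *x % y;

        *x /= y;
        return (unsigned long)rem;
}
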
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
new file mode 100644
index 000000000000..a1b0d22f6978
--- /dev/null
+++ b/arch/x86/math-emu/errors.c
@@ -0,0 +1,739 @@
1/*---------------------------------------------------------------------------+
2 | errors.c |
3 | |
4 | The error handling functions for wm-FPU-emu |
5 | |
6 | Copyright (C) 1992,1993,1994,1996 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20#include <linux/signal.h>
21
22#include <asm/uaccess.h>
23
24#include "fpu_emu.h"
25#include "fpu_system.h"
26#include "exception.h"
27#include "status_w.h"
28#include "control_w.h"
29#include "reg_constant.h"
30#include "version.h"
31
32/* */
33#undef PRINT_MESSAGES
34/* */
35
36
37#if 0
38void Un_impl(void)
39{
40 u_char byte1, FPU_modrm;
41 unsigned long address = FPU_ORIG_EIP;
42
43 RE_ENTRANT_CHECK_OFF;
44 /* No need to check access_ok(), we have previously fetched these bytes. */
45 printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address);
46 if ( FPU_CS == __USER_CS )
47 {
48 while ( 1 )
49 {
50 FPU_get_user(byte1, (u_char __user *) address);
51 if ( (byte1 & 0xf8) == 0xd8 ) break;
52 printk("[%02x]", byte1);
53 address++;
54 }
55 printk("%02x ", byte1);
56 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
57
58 if (FPU_modrm >= 0300)
59 printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
60 else
61 printk("/%d\n", (FPU_modrm >> 3) & 7);
62 }
63 else
64 {
65 printk("cs selector = %04x\n", FPU_CS);
66 }
67
68 RE_ENTRANT_CHECK_ON;
69
70 EXCEPTION(EX_Invalid);
71
72}
73#endif /* 0 */
74
75
76/*
77 Called for opcodes which are illegal and which are known to result in a
78 SIGILL with a real 80486.
79 */
80void FPU_illegal(void)
81{
82 math_abort(FPU_info,SIGILL);
83}
84
85
86
87void FPU_printall(void)
88{
89 int i;
90 static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
91 "DeNorm", "Inf", "NaN" };
92 u_char byte1, FPU_modrm;
93 unsigned long address = FPU_ORIG_EIP;
94
95 RE_ENTRANT_CHECK_OFF;
96 /* No need to check access_ok(), we have previously fetched these bytes. */
97 printk("At %p:", (void *) address);
98 if ( FPU_CS == __USER_CS )
99 {
100#define MAX_PRINTED_BYTES 20
101 for ( i = 0; i < MAX_PRINTED_BYTES; i++ )
102 {
103 FPU_get_user(byte1, (u_char __user *) address);
104 if ( (byte1 & 0xf8) == 0xd8 )
105 {
106 printk(" %02x", byte1);
107 break;
108 }
109 printk(" [%02x]", byte1);
110 address++;
111 }
112 if ( i == MAX_PRINTED_BYTES )
113 printk(" [more..]\n");
114 else
115 {
116 FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
117
118 if (FPU_modrm >= 0300)
119 printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
120 else
121 printk(" /%d, mod=%d rm=%d\n",
122 (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7);
123 }
124 }
125 else
126 {
127 printk("%04x\n", FPU_CS);
128 }
129
130 partial_status = status_word();
131
132#ifdef DEBUGGING
133if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n");
134if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n");
135if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n");
136if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n");
137if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n");
138if ( partial_status & SW_Summary ) printk("SW: exception summary\n");
139if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n");
140if ( partial_status & SW_Precision ) printk("SW: loss of precision\n");
141if ( partial_status & SW_Underflow ) printk("SW: underflow\n");
142if ( partial_status & SW_Overflow ) printk("SW: overflow\n");
143if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n");
144if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n");
145if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n");
146#endif /* DEBUGGING */
147
148 printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n",
149 partial_status & 0x8000 ? 1 : 0, /* busy */
150 (partial_status & 0x3800) >> 11, /* stack top pointer */
151 partial_status & 0x80 ? 1 : 0, /* Error summary status */
152 partial_status & 0x40 ? 1 : 0, /* Stack flag */
153 partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */
154 partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */
155 partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0,
156 partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0,
157 partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0);
158
159printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n",
160 control_word & 0x1000 ? 1 : 0,
161 (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
162 (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
163 control_word & 0x80 ? 1 : 0,
164 control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0,
165 control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0,
166 control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0);
167
168 for ( i = 0; i < 8; i++ )
169 {
170 FPU_REG *r = &st(i);
171 u_char tagi = FPU_gettagi(i);
172 switch (tagi)
173 {
174 case TAG_Empty:
175 continue;
176 break;
177 case TAG_Zero:
178 case TAG_Special:
179 tagi = FPU_Special(r);
180 case TAG_Valid:
181 printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
182 getsign(r) ? '-' : '+',
183 (long)(r->sigh >> 16),
184 (long)(r->sigh & 0xFFFF),
185 (long)(r->sigl >> 16),
186 (long)(r->sigl & 0xFFFF),
187 exponent(r) - EXP_BIAS + 1);
188 break;
189 default:
190 printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi);
191 continue;
192 break;
193 }
194 printk("%s\n", tag_desc[(int) (unsigned) tagi]);
195 }
196
197 RE_ENTRANT_CHECK_ON;
198
199}
200
201static struct {
202 int type;
203 const char *name;
204} exception_names[] = {
205 { EX_StackOver, "stack overflow" },
206 { EX_StackUnder, "stack underflow" },
207 { EX_Precision, "loss of precision" },
208 { EX_Underflow, "underflow" },
209 { EX_Overflow, "overflow" },
210 { EX_ZeroDiv, "divide by zero" },
211 { EX_Denormal, "denormalized operand" },
212 { EX_Invalid, "invalid operation" },
213 { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION },
214 { 0, NULL }
215};
216
217/*
218 EX_INTERNAL is always given with a code which indicates where the
219 error was detected.
220
221 Internal error types:
222 0x14 in fpu_etc.c
223 0x1nn in a *.c file:
224 0x101 in reg_add_sub.c
225 0x102 in reg_mul.c
226 0x104 in poly_atan.c
227 0x105 in reg_mul.c
228 0x107 in fpu_trig.c
229 0x108 in reg_compare.c
230 0x109 in reg_compare.c
231 0x110 in reg_add_sub.c
232 0x111 in fpe_entry.c
233 0x112 in fpu_trig.c
234 0x113 in errors.c
235 0x115 in fpu_trig.c
236 0x116 in fpu_trig.c
237 0x117 in fpu_trig.c
238 0x118 in fpu_trig.c
239 0x119 in fpu_trig.c
240 0x120 in poly_atan.c
241 0x121 in reg_compare.c
242 0x122 in reg_compare.c
243 0x123 in reg_compare.c
244 0x125 in fpu_trig.c
245 0x126 in fpu_entry.c
246 0x127 in poly_2xm1.c
247 0x128 in fpu_entry.c
248 0x129 in fpu_entry.c
249 0x130 in get_address.c
250 0x131 in get_address.c
251 0x132 in get_address.c
252 0x133 in get_address.c
253 0x140 in load_store.c
254 0x141 in load_store.c
255 0x150 in poly_sin.c
256 0x151 in poly_sin.c
257 0x160 in reg_ld_str.c
258 0x161 in reg_ld_str.c
259 0x162 in reg_ld_str.c
260 0x163 in reg_ld_str.c
261 0x164 in reg_ld_str.c
262 0x170 in fpu_tags.c
263 0x171 in fpu_tags.c
264 0x172 in fpu_tags.c
265 0x180 in reg_convert.c
266 0x2nn in an *.S file:
267 0x201 in reg_u_add.S
268 0x202 in reg_u_div.S
269 0x203 in reg_u_div.S
270 0x204 in reg_u_div.S
271 0x205 in reg_u_mul.S
272 0x206 in reg_u_sub.S
273 0x207 in wm_sqrt.S
274 0x208 in reg_div.S
275 0x209 in reg_u_sub.S
276 0x210 in reg_u_sub.S
277 0x211 in reg_u_sub.S
278 0x212 in reg_u_sub.S
279 0x213 in wm_sqrt.S
280 0x214 in wm_sqrt.S
281 0x215 in wm_sqrt.S
282 0x220 in reg_norm.S
283 0x221 in reg_norm.S
284 0x230 in reg_round.S
285 0x231 in reg_round.S
286 0x232 in reg_round.S
287 0x233 in reg_round.S
288 0x234 in reg_round.S
289 0x235 in reg_round.S
290 0x236 in reg_round.S
291 0x240 in div_Xsig.S
292 0x241 in div_Xsig.S
293 0x242 in div_Xsig.S
294 */
295
296asmlinkage void FPU_exception(int n)
297{
298 int i, int_type;
299
300 int_type = 0; /* Needed only to stop compiler warnings */
301 if ( n & EX_INTERNAL )
302 {
303 int_type = n - EX_INTERNAL;
304 n = EX_INTERNAL;
305 /* Set lots of exception bits! */
306 partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
307 }
308 else
309 {
310 /* Extract only the bits which we use to set the status word */
311 n &= (SW_Exc_Mask);
312 /* Set the corresponding exception bit */
313 partial_status |= n;
314 /* Set summary bits iff exception isn't masked */
315 if ( partial_status & ~control_word & CW_Exceptions )
316 partial_status |= (SW_Summary | SW_Backward);
317 if ( n & (SW_Stack_Fault | EX_Precision) )
318 {
319 if ( !(n & SW_C1) )
320 /* This bit distinguishes over- from underflow for a stack fault,
321 and roundup from round-down for precision loss. */
322 partial_status &= ~SW_C1;
323 }
324 }
325
326 RE_ENTRANT_CHECK_OFF;
327 if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) )
328 {
329#ifdef PRINT_MESSAGES
330 /* My message from the sponsor */
331 printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n");
332#endif /* PRINT_MESSAGES */
333
334 /* Get a name string for error reporting */
335 for (i=0; exception_names[i].type; i++)
336 if ( (exception_names[i].type & n) == exception_names[i].type )
337 break;
338
339 if (exception_names[i].type)
340 {
341#ifdef PRINT_MESSAGES
342 printk("FP Exception: %s!\n", exception_names[i].name);
343#endif /* PRINT_MESSAGES */
344 }
345 else
346 printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
347
348 if ( n == EX_INTERNAL )
349 {
350 printk("FPU emulator: Internal error type 0x%04x\n", int_type);
351 FPU_printall();
352 }
353#ifdef PRINT_MESSAGES
354 else
355 FPU_printall();
356#endif /* PRINT_MESSAGES */
357
358 /*
359 * The 80486 generates an interrupt on the next non-control FPU
360 * instruction. So we need some means of flagging it.
361 * We use the ES (Error Summary) bit for this.
362 */
363 }
364 RE_ENTRANT_CHECK_ON;
365
366#ifdef __DEBUG__
367 math_abort(FPU_info,SIGFPE);
368#endif /* __DEBUG__ */
369
370}
371
372
373/* Real operation attempted on a NaN. */
374/* Returns < 0 if the exception is unmasked */
375int real_1op_NaN(FPU_REG *a)
376{
377 int signalling, isNaN;
378
379 isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
380
381 /* The default result for the case of two "equal" NaNs (signs may
382 differ) is chosen to reproduce 80486 behaviour */
383 signalling = isNaN && !(a->sigh & 0x40000000);
384
385 if ( !signalling )
386 {
387 if ( !isNaN ) /* pseudo-NaN, or other unsupported? */
388 {
389 if ( control_word & CW_Invalid )
390 {
391 /* Masked response */
392 reg_copy(&CONST_QNaN, a);
393 }
394 EXCEPTION(EX_Invalid);
395 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
396 }
397 return TAG_Special;
398 }
399
400 if ( control_word & CW_Invalid )
401 {
402 /* The masked response */
403 if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */
404 {
405 reg_copy(&CONST_QNaN, a);
406 }
407 /* ensure a Quiet NaN */
408 a->sigh |= 0x40000000;
409 }
410
411 EXCEPTION(EX_Invalid);
412
413 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
414}
415
416
417/* Real operation attempted on two operands, one a NaN. */
418/* Returns < 0 if the exception is unmasked */
419int real_2op_NaN(FPU_REG const *b, u_char tagb,
420 int deststnr,
421 FPU_REG const *defaultNaN)
422{
423 FPU_REG *dest = &st(deststnr);
424 FPU_REG const *a = dest;
425 u_char taga = FPU_gettagi(deststnr);
426 FPU_REG const *x;
427 int signalling, unsupported;
428
429 if ( taga == TAG_Special )
430 taga = FPU_Special(a);
431 if ( tagb == TAG_Special )
432 tagb = FPU_Special(b);
433
434 /* TW_NaN is also used for unsupported data types. */
435 unsupported = ((taga == TW_NaN)
436 && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000)))
437 || ((tagb == TW_NaN)
438 && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
439 if ( unsupported )
440 {
441 if ( control_word & CW_Invalid )
442 {
443 /* Masked response */
444 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
445 }
446 EXCEPTION(EX_Invalid);
447 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
448 }
449
450 if (taga == TW_NaN)
451 {
452 x = a;
453 if (tagb == TW_NaN)
454 {
455 signalling = !(a->sigh & b->sigh & 0x40000000);
456 if ( significand(b) > significand(a) )
457 x = b;
458 else if ( significand(b) == significand(a) )
459 {
460 /* The default result for the case of two "equal" NaNs (signs may
461 differ) is chosen to reproduce 80486 behaviour */
462 x = defaultNaN;
463 }
464 }
465 else
466 {
467 /* return the quiet version of the NaN in a */
468 signalling = !(a->sigh & 0x40000000);
469 }
470 }
471 else
472#ifdef PARANOID
473 if (tagb == TW_NaN)
474#endif /* PARANOID */
475 {
476 signalling = !(b->sigh & 0x40000000);
477 x = b;
478 }
479#ifdef PARANOID
480 else
481 {
482 signalling = 0;
483 EXCEPTION(EX_INTERNAL|0x113);
484 x = &CONST_QNaN;
485 }
486#endif /* PARANOID */
487
488 if ( (!signalling) || (control_word & CW_Invalid) )
489 {
490 if ( ! x )
491 x = b;
492
493 if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */
494 x = &CONST_QNaN;
495
496 FPU_copy_to_regi(x, TAG_Special, deststnr);
497
498 if ( !signalling )
499 return TAG_Special;
500
501 /* ensure a Quiet NaN */
502 dest->sigh |= 0x40000000;
503 }
504
505 EXCEPTION(EX_Invalid);
506
507 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
508}
509
510
511/* Invalid arith operation on Valid registers */
512/* Returns < 0 if the exception is unmasked */
513asmlinkage int arith_invalid(int deststnr)
514{
515
516 EXCEPTION(EX_Invalid);
517
518 if ( control_word & CW_Invalid )
519 {
520 /* The masked response */
521 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
522 }
523
524 return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
525
526}
527
528
529/* Divide a finite number by zero */
530asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
531{
532 FPU_REG *dest = &st(deststnr);
533 int tag = TAG_Valid;
534
535 if ( control_word & CW_ZeroDiv )
536 {
537 /* The masked response */
538 FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
539 setsign(dest, sign);
540 tag = TAG_Special;
541 }
542
543 EXCEPTION(EX_ZeroDiv);
544
545 return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
546
547}
548
549
550/* This may be called often, so keep it lean */
551int set_precision_flag(int flags)
552{
553 if ( control_word & CW_Precision )
554 {
555 partial_status &= ~(SW_C1 & flags);
556 partial_status |= flags; /* The masked response */
557 return 0;
558 }
559 else
560 {
561 EXCEPTION(flags);
562 return 1;
563 }
564}
565
566
567/* This may be called often, so keep it lean */
568asmlinkage void set_precision_flag_up(void)
569{
570 if ( control_word & CW_Precision )
571 partial_status |= (SW_Precision | SW_C1); /* The masked response */
572 else
573 EXCEPTION(EX_Precision | SW_C1);
574}
575
576
577/* This may be called often, so keep it lean */
578asmlinkage void set_precision_flag_down(void)
579{
580 if ( control_word & CW_Precision )
581 { /* The masked response */
582 partial_status &= ~SW_C1;
583 partial_status |= SW_Precision;
584 }
585 else
586 EXCEPTION(EX_Precision);
587}
588
589
590asmlinkage int denormal_operand(void)
591{
592 if ( control_word & CW_Denormal )
593 { /* The masked response */
594 partial_status |= SW_Denorm_Op;
595 return TAG_Special;
596 }
597 else
598 {
599 EXCEPTION(EX_Denormal);
600 return TAG_Special | FPU_Exception;
601 }
602}
603
604
605asmlinkage int arith_overflow(FPU_REG *dest)
606{
607 int tag = TAG_Valid;
608
609 if ( control_word & CW_Overflow )
610 {
611 /* The masked response */
612/* ###### The response here depends upon the rounding mode */
613 reg_copy(&CONST_INF, dest);
614 tag = TAG_Special;
615 }
616 else
617 {
618 /* Subtract the magic number from the exponent */
619 addexponent(dest, (-3 * (1 << 13)));
620 }
621
622 EXCEPTION(EX_Overflow);
623 if ( control_word & CW_Overflow )
624 {
625 /* The overflow exception is masked. */
626 /* By definition, precision is lost.
627 The roundup bit (C1) is also set because we have
628 "rounded" upwards to Infinity. */
629 EXCEPTION(EX_Precision | SW_C1);
630 return tag;
631 }
632
633 return tag;
634
635}
636
637
638asmlinkage int arith_underflow(FPU_REG *dest)
639{
640 int tag = TAG_Valid;
641
642 if ( control_word & CW_Underflow )
643 {
644 /* The masked response */
645 if ( exponent16(dest) <= EXP_UNDER - 63 )
646 {
647 reg_copy(&CONST_Z, dest);
648 partial_status &= ~SW_C1; /* Round down. */
649 tag = TAG_Zero;
650 }
651 else
652 {
653 stdexp(dest);
654 }
655 }
656 else
657 {
658 /* Add the magic number to the exponent. */
659 addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
660 }
661
662 EXCEPTION(EX_Underflow);
663 if ( control_word & CW_Underflow )
664 {
665 /* The underflow exception is masked. */
666 EXCEPTION(EX_Precision);
667 return tag;
668 }
669
670 return tag;
671
672}
673
674
675void FPU_stack_overflow(void)
676{
677
678 if ( control_word & CW_Invalid )
679 {
680 /* The masked response */
681 top--;
682 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
683 }
684
685 EXCEPTION(EX_StackOver);
686
687 return;
688
689}
690
691
692void FPU_stack_underflow(void)
693{
694
695 if ( control_word & CW_Invalid )
696 {
697 /* The masked response */
698 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
699 }
700
701 EXCEPTION(EX_StackUnder);
702
703 return;
704
705}
706
707
708void FPU_stack_underflow_i(int i)
709{
710
711 if ( control_word & CW_Invalid )
712 {
713 /* The masked response */
714 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
715 }
716
717 EXCEPTION(EX_StackUnder);
718
719 return;
720
721}
722
723
724void FPU_stack_underflow_pop(int i)
725{
726
727 if ( control_word & CW_Invalid )
728 {
729 /* The masked response */
730 FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
731 FPU_pop();
732 }
733
734 EXCEPTION(EX_StackUnder);
735
736 return;
737
738}
739
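
Most of the helpers above (real_1op_NaN(), arith_invalid(), FPU_divide_by_zero(), the stack-fault routines) share one return convention: the tag of the result, with FPU_Exception OR'd in when the fault was unmasked, which is why their comments say "Returns < 0 if the exception is unmasked". A small stand-alone illustration of that encoding, with hypothetical names and the FPU_Exception/TAG_Special values taken from fpu_emu.h:

#include <stdio.h>

#define FPU_Exception 0x80000000        /* added to tag returns (fpu_emu.h) */
#define TAG_Special   2

/* Mimics the shape of the returns in errors.c: tag in the low bits,
   sign bit set only when the fault is unmasked. */
static int fake_invalid_op(int masked)
{
        return (masked ? 0 : FPU_Exception) | TAG_Special;
}

int main(void)
{
        int tag = fake_invalid_op(/* masked = */ 0);

        if (tag < 0)            /* FPU_Exception set: caller must back out */
                printf("unmasked fault, tag bits = %d\n",
                       (int)(tag & ~FPU_Exception));
        else
                printf("masked fault, carry on with tag %d\n", tag);
        return 0;
}
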
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
new file mode 100644
index 000000000000..b463f21a811e
--- /dev/null
+++ b/arch/x86/math-emu/exception.h
@@ -0,0 +1,53 @@
1/*---------------------------------------------------------------------------+
2 | exception.h |
3 | |
4 | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
5 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
6 | |
7 +---------------------------------------------------------------------------*/
8
9#ifndef _EXCEPTION_H_
10#define _EXCEPTION_H_
11
12
13#ifdef __ASSEMBLY__
14#define Const_(x) $##x
15#else
16#define Const_(x) x
17#endif
18
19#ifndef SW_C1
20#include "fpu_emu.h"
21#endif /* SW_C1 */
22
23#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */
24#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
25/* Special exceptions: */
26#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */
27#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */
28#define EX_StackUnder Const_(0x0041) /* stack underflow */
29/* Exception flags: */
30#define EX_Precision Const_(0x0020) /* loss of precision */
31#define EX_Underflow Const_(0x0010) /* underflow */
32#define EX_Overflow Const_(0x0008) /* overflow */
33#define EX_ZeroDiv Const_(0x0004) /* divide by zero */
34#define EX_Denormal Const_(0x0002) /* denormalized operand */
35#define EX_Invalid Const_(0x0001) /* invalid operation */
36
37
38#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
39#define PRECISION_LOST_DOWN Const_(EX_Precision)
40
41
42#ifndef __ASSEMBLY__
43
44#ifdef DEBUG
45#define EXCEPTION(x) { printk("exception in %s at line %d\n", \
46 __FILE__, __LINE__); FPU_exception(x); }
47#else
48#define EXCEPTION(x) FPU_exception(x)
49#endif
50
51#endif /* __ASSEMBLY__ */
52
53#endif /* _EXCEPTION_H_ */
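
Note that EX_StackOver and EX_StackUnder differ only in SW_C1: both set the invalid-operation and stack-fault bits, and C1 is what later distinguishes a stack overflow from an underflow (the same role it plays for round-up versus round-down on precision loss in errors.c). A tiny stand-alone check, assuming the usual status-word layout where SW_C1 is bit 9; the real value lives in status_w.h, which is not part of this hunk:

#include <stdio.h>

#define SW_C1           0x0200                  /* assumed: bit 9, see status_w.h */
#define EX_StackUnder   0x0041                  /* invalid op + stack fault */
#define EX_StackOver    (0x0041 | SW_C1)        /* same bits, plus C1 = "overflow" */

int main(void)
{
        printf("under=0x%04x over=0x%04x differ only in 0x%04x\n",
               EX_StackUnder, EX_StackOver, EX_StackUnder ^ EX_StackOver);
        return 0;
}
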
diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c
new file mode 100644
index 000000000000..6972dec01af6
--- /dev/null
+++ b/arch/x86/math-emu/fpu_arith.c
@@ -0,0 +1,174 @@
1/*---------------------------------------------------------------------------+
2 | fpu_arith.c |
3 | |
4 | Code to implement the FPU register/register arithmetic instructions |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "fpu_emu.h"
15#include "control_w.h"
16#include "status_w.h"
17
18
19void fadd__(void)
20{
21 /* fadd st,st(i) */
22 int i = FPU_rm;
23 clear_C1();
24 FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
25}
26
27
28void fmul__(void)
29{
30 /* fmul st,st(i) */
31 int i = FPU_rm;
32 clear_C1();
33 FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
34}
35
36
37
38void fsub__(void)
39{
40 /* fsub st,st(i) */
41 clear_C1();
42 FPU_sub(0, FPU_rm, control_word);
43}
44
45
46void fsubr_(void)
47{
48 /* fsubr st,st(i) */
49 clear_C1();
50 FPU_sub(REV, FPU_rm, control_word);
51}
52
53
54void fdiv__(void)
55{
56 /* fdiv st,st(i) */
57 clear_C1();
58 FPU_div(0, FPU_rm, control_word);
59}
60
61
62void fdivr_(void)
63{
64 /* fdivr st,st(i) */
65 clear_C1();
66 FPU_div(REV, FPU_rm, control_word);
67}
68
69
70
71void fadd_i(void)
72{
73 /* fadd st(i),st */
74 int i = FPU_rm;
75 clear_C1();
76 FPU_add(&st(i), FPU_gettagi(i), i, control_word);
77}
78
79
80void fmul_i(void)
81{
82 /* fmul st(i),st */
83 clear_C1();
84 FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
85}
86
87
88void fsubri(void)
89{
90 /* fsubr st(i),st */
91 clear_C1();
92 FPU_sub(DEST_RM, FPU_rm, control_word);
93}
94
95
96void fsub_i(void)
97{
98 /* fsub st(i),st */
99 clear_C1();
100 FPU_sub(REV|DEST_RM, FPU_rm, control_word);
101}
102
103
104void fdivri(void)
105{
106 /* fdivr st(i),st */
107 clear_C1();
108 FPU_div(DEST_RM, FPU_rm, control_word);
109}
110
111
112void fdiv_i(void)
113{
114 /* fdiv st(i),st */
115 clear_C1();
116 FPU_div(REV|DEST_RM, FPU_rm, control_word);
117}
118
119
120
121void faddp_(void)
122{
123 /* faddp st(i),st */
124 int i = FPU_rm;
125 clear_C1();
126 if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 )
127 FPU_pop();
128}
129
130
131void fmulp_(void)
132{
133 /* fmulp st(i),st */
134 clear_C1();
135 if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 )
136 FPU_pop();
137}
138
139
140
141void fsubrp(void)
142{
143 /* fsubrp st(i),st */
144 clear_C1();
145 if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 )
146 FPU_pop();
147}
148
149
150void fsubp_(void)
151{
152 /* fsubp st(i),st */
153 clear_C1();
154 if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 )
155 FPU_pop();
156}
157
158
159void fdivrp(void)
160{
161 /* fdivrp st(i),st */
162 clear_C1();
163 if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 )
164 FPU_pop();
165}
166
167
168void fdivp_(void)
169{
170 /* fdivp st(i),st */
171 clear_C1();
172 if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 )
173 FPU_pop();
174}
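
The register/register entry points above funnel into FPU_sub()/FPU_div() with two flag bits from fpu_emu.h: as the call sites and instruction comments suggest, REV reverses the operand order and DEST_RM sends the result to st(i) instead of st(0). A throwaway decoder makes the mapping explicit (hypothetical helper, not kernel code):

#include <stdio.h>

#define REV     0x10            /* operands reversed: st(i) OP st(0) */
#define DEST_RM 0x20            /* result goes to st(i), not st(0) */

static void describe(const char *insn, int flags)
{
        printf("%-16s -> %s = %s - %s\n", insn,
               (flags & DEST_RM) ? "st(i)" : "st(0)",
               (flags & REV)     ? "st(i)" : "st(0)",
               (flags & REV)     ? "st(0)" : "st(i)");
}

int main(void)
{
        describe("fsub st,st(i)",  0);                  /* fsub__  */
        describe("fsubr st,st(i)", REV);                /* fsubr_  */
        describe("fsubr st(i),st", DEST_RM);            /* fsubri  */
        describe("fsub st(i),st",  REV | DEST_RM);      /* fsub_i  */
        return 0;
}
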
diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h
new file mode 100644
index 000000000000..9ba12416df12
--- /dev/null
+++ b/arch/x86/math-emu/fpu_asm.h
@@ -0,0 +1,32 @@
1/*---------------------------------------------------------------------------+
2 | fpu_asm.h |
3 | |
4 | Copyright (C) 1992,1995,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _FPU_ASM_H_
11#define _FPU_ASM_H_
12
13#include <linux/linkage.h>
14
15#define EXCEPTION FPU_exception
16
17
18#define PARAM1 8(%ebp)
19#define PARAM2 12(%ebp)
20#define PARAM3 16(%ebp)
21#define PARAM4 20(%ebp)
22#define PARAM5 24(%ebp)
23#define PARAM6 28(%ebp)
24#define PARAM7 32(%ebp)
25
26#define SIGL_OFFSET 0
27#define EXP(x) 8(x)
28#define SIG(x) SIGL_OFFSET##(x)
29#define SIGL(x) SIGL_OFFSET##(x)
30#define SIGH(x) 4(x)
31
32#endif /* _FPU_ASM_H_ */
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
new file mode 100644
index 000000000000..20886cfb9f76
--- /dev/null
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -0,0 +1,204 @@
1/*---------------------------------------------------------------------------+
2 | fpu_aux.c |
3 | |
4 | Code to implement some of the FPU auxiliary instructions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "control_w.h"
18
19
20static void fnop(void)
21{
22}
23
24static void fclex(void)
25{
26 partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision|
27 SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op|
28 SW_Invalid);
29 no_ip_update = 1;
30}
31
32/* Needs to be externally visible */
33void finit(void)
34{
35 control_word = 0x037f;
36 partial_status = 0;
37 top = 0; /* We don't keep top in the status word internally. */
38 fpu_tag_word = 0xffff;
39 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0;
42 operand_address.selector = 0;
43 instruction_address.offset = 0;
44 instruction_address.selector = 0;
45 instruction_address.opcode = 0;
46 no_ip_update = 1;
47}
48
49/*
50 * These are nops on the i387..
51 */
52#define feni fnop
53#define fdisi fnop
54#define fsetpm fnop
55
56static FUNC const finit_table[] = {
57 feni, fdisi, fclex, finit,
58 fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
59};
60
61void finit_(void)
62{
63 (finit_table[FPU_rm])();
64}
65
66
67static void fstsw_ax(void)
68{
69 *(short *) &FPU_EAX = status_word();
70 no_ip_update = 1;
71}
72
73static FUNC const fstsw_table[] = {
74 fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
75 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
76};
77
78void fstsw_(void)
79{
80 (fstsw_table[FPU_rm])();
81}
82
83
84static FUNC const fp_nop_table[] = {
85 fnop, FPU_illegal, FPU_illegal, FPU_illegal,
86 FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
87};
88
89void fp_nop(void)
90{
91 (fp_nop_table[FPU_rm])();
92}
93
94
95void fld_i_(void)
96{
97 FPU_REG *st_new_ptr;
98 int i;
99 u_char tag;
100
101 if ( STACK_OVERFLOW )
102 { FPU_stack_overflow(); return; }
103
104 /* fld st(i) */
105 i = FPU_rm;
106 if ( NOT_EMPTY(i) )
107 {
108 reg_copy(&st(i), st_new_ptr);
109 tag = FPU_gettagi(i);
110 push();
111 FPU_settag0(tag);
112 }
113 else
114 {
115 if ( control_word & CW_Invalid )
116 {
117 /* The masked response */
118 FPU_stack_underflow();
119 }
120 else
121 EXCEPTION(EX_StackUnder);
122 }
123
124}
125
126
127void fxch_i(void)
128{
129 /* fxch st(i) */
130 FPU_REG t;
131 int i = FPU_rm;
132 FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
133 long tag_word = fpu_tag_word;
134 int regnr = top & 7, regnri = ((regnr + i) & 7);
135 u_char st0_tag = (tag_word >> (regnr*2)) & 3;
136 u_char sti_tag = (tag_word >> (regnri*2)) & 3;
137
138 if ( st0_tag == TAG_Empty )
139 {
140 if ( sti_tag == TAG_Empty )
141 {
142 FPU_stack_underflow();
143 FPU_stack_underflow_i(i);
144 return;
145 }
146 if ( control_word & CW_Invalid )
147 {
148 /* Masked response */
149 FPU_copy_to_reg0(sti_ptr, sti_tag);
150 }
151 FPU_stack_underflow_i(i);
152 return;
153 }
154 if ( sti_tag == TAG_Empty )
155 {
156 if ( control_word & CW_Invalid )
157 {
158 /* Masked response */
159 FPU_copy_to_regi(st0_ptr, st0_tag, i);
160 }
161 FPU_stack_underflow();
162 return;
163 }
164 clear_C1();
165
166 reg_copy(st0_ptr, &t);
167 reg_copy(sti_ptr, st0_ptr);
168 reg_copy(&t, sti_ptr);
169
170 tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2));
171 tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2));
172 fpu_tag_word = tag_word;
173}
174
175
176void ffree_(void)
177{
178 /* ffree st(i) */
179 FPU_settagi(FPU_rm, TAG_Empty);
180}
181
182
183void ffreep(void)
184{
185 /* ffree st(i) + pop - unofficial code */
186 FPU_settagi(FPU_rm, TAG_Empty);
187 FPU_pop();
188}
189
190
191void fst_i_(void)
192{
193 /* fst st(i) */
194 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
195}
196
197
198void fstp_i(void)
199{
200 /* fstp st(i) */
201 FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
202 FPU_pop();
203}
204
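
fxch_i() above shows the tag-word layout: two bits per physical register, with the physical slot obtained by adding the stack offset to top modulo 8, and finit() initialising the whole word to 0xffff (all TAG_Empty). A compact stand-alone model of those bit manipulations, using hypothetical helpers with the same layout:

#include <stdio.h>

#define TAG_Valid 0
#define TAG_Empty 3

/* Two tag bits per physical register; st(i) lives in slot (top + i) & 7. */
static unsigned get_tag(unsigned tag_word, int top, int i)
{
        int slot = (top + i) & 7;
        return (tag_word >> (slot * 2)) & 3;
}

static unsigned set_tag(unsigned tag_word, int top, int i, unsigned tag)
{
        int slot = (top + i) & 7;

        tag_word &= ~(3u << (slot * 2));
        return tag_word | (tag << (slot * 2));
}

int main(void)
{
        unsigned tw = 0xffff;           /* finit(): everything empty */
        int top = 6;                    /* arbitrary top-of-stack value */

        tw = set_tag(tw, top, 0, TAG_Valid);
        printf("st(0)=%u st(1)=%u\n", get_tag(tw, top, 0), get_tag(tw, top, 1));
        return 0;
}
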
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
new file mode 100644
index 000000000000..65120f523853
--- /dev/null
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -0,0 +1,218 @@
1/*---------------------------------------------------------------------------+
2 | fpu_emu.h |
3 | |
4 | Copyright (C) 1992,1993,1994,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10
11#ifndef _FPU_EMU_H_
12#define _FPU_EMU_H_
13
14/*
15 * Define PECULIAR_486 to get a closer approximation to 80486 behaviour,
16 * rather than behaviour which appears to be cleaner.
17 * This is a matter of opinion: for all I know, the 80486 may simply
18 * be complying with the IEEE spec. Maybe one day I'll get to see the
19 * spec...
20 */
21#define PECULIAR_486
22
23#ifdef __ASSEMBLY__
24#include "fpu_asm.h"
25#define Const(x) $##x
26#else
27#define Const(x) x
28#endif
29
30#define EXP_BIAS Const(0)
31#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
32#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
33#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
34 still a 16 bit nr. */
35#define EXP_Infinity EXP_OVER
36#define EXP_NaN EXP_OVER
37
38#define EXTENDED_Ebias Const(0x3fff)
39#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
40
41#define SIGN_POS Const(0)
42#define SIGN_NEG Const(0x80)
43
44#define SIGN_Positive Const(0)
45#define SIGN_Negative Const(0x8000)
46
47
48/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
49/* The following fold to 2 (Special) in the Tag Word */
50#define TW_Denormal Const(4) /* De-normal */
51#define TW_Infinity Const(5) /* + or - infinity */
52#define TW_NaN Const(6) /* Not a Number */
53#define TW_Unsupported Const(7) /* Not supported by an 80486 */
54
55#define TAG_Valid Const(0) /* valid */
56#define TAG_Zero Const(1) /* zero */
57#define TAG_Special Const(2) /* De-normal, + or - infinity,
58 or Not a Number */
59#define TAG_Empty Const(3) /* empty */
60#define TAG_Error Const(0x80) /* probably need to abort */
61
62#define LOADED_DATA Const(10101) /* Special st() number to identify
63 loaded data (not on stack). */
64
65/* A few flags (must be >= 0x10). */
66#define REV 0x10
67#define DEST_RM 0x20
68#define LOADED 0x40
69
70#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
71
72
73#ifndef __ASSEMBLY__
74
75#include "fpu_system.h"
76
77#include <asm/sigcontext.h> /* for struct _fpstate */
78#include <asm/math_emu.h>
79#include <linux/linkage.h>
80
81/*
82#define RE_ENTRANT_CHECKING
83 */
84
85#ifdef RE_ENTRANT_CHECKING
86extern u_char emulating;
87# define RE_ENTRANT_CHECK_OFF emulating = 0
88# define RE_ENTRANT_CHECK_ON emulating = 1
89#else
90# define RE_ENTRANT_CHECK_OFF
91# define RE_ENTRANT_CHECK_ON
92#endif /* RE_ENTRANT_CHECKING */
93
94#define FWAIT_OPCODE 0x9b
95#define OP_SIZE_PREFIX 0x66
96#define ADDR_SIZE_PREFIX 0x67
97#define PREFIX_CS 0x2e
98#define PREFIX_DS 0x3e
99#define PREFIX_ES 0x26
100#define PREFIX_SS 0x36
101#define PREFIX_FS 0x64
102#define PREFIX_GS 0x65
103#define PREFIX_REPE 0xf3
104#define PREFIX_REPNE 0xf2
105#define PREFIX_LOCK 0xf0
106#define PREFIX_CS_ 1
107#define PREFIX_DS_ 2
108#define PREFIX_ES_ 3
109#define PREFIX_FS_ 4
110#define PREFIX_GS_ 5
111#define PREFIX_SS_ 6
112#define PREFIX_DEFAULT 7
113
114struct address {
115 unsigned int offset;
116 unsigned int selector:16;
117 unsigned int opcode:11;
118 unsigned int empty:5;
119};
120struct fpu__reg {
121 unsigned sigl;
122 unsigned sigh;
123 short exp;
124};
125
126typedef void (*FUNC)(void);
127typedef struct fpu__reg FPU_REG;
128typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag);
129typedef struct { u_char address_size, operand_size, segment; }
130 overrides;
131/* This structure is 32 bits: */
132typedef struct { overrides override;
133 u_char default_mode; } fpu_addr_modes;
134/* PROTECTED has a restricted meaning in the emulator; it is used
135 to signal that the emulator needs to do special things to ensure
136 that protection is respected in a segmented model. */
137#define PROTECTED 4
138#define SIXTEEN 1 /* We rely upon this being 1 (true) */
139#define VM86 SIXTEEN
140#define PM16 (SIXTEEN | PROTECTED)
141#define SEG32 PROTECTED
142extern u_char const data_sizes_16[32];
143
144#define register_base ((u_char *) registers )
145#define fpu_register(x) ( * ((FPU_REG *)( register_base + 10 * (x & 7) )) )
146#define st(x) ( * ((FPU_REG *)( register_base + 10 * ((top+x) & 7) )) )
147
148#define STACK_OVERFLOW (FPU_stackoverflow(&st_new_ptr))
149#define NOT_EMPTY(i) (!FPU_empty_i(i))
150
151#define NOT_EMPTY_ST0 (st0_tag ^ TAG_Empty)
152
153#define poppop() { FPU_pop(); FPU_pop(); }
154
155/* push() does not affect the tags */
156#define push() { top--; }
157
158#define signbyte(a) (((u_char *)(a))[9])
159#define getsign(a) (signbyte(a) & 0x80)
160#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; }
161#define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \
162 else signbyte(b) &= 0x7f; }
163#define changesign(a) { signbyte(a) ^= 0x80; }
164#define setpositive(a) { signbyte(a) &= 0x7f; }
165#define setnegative(a) { signbyte(a) |= 0x80; }
166#define signpositive(a) ( (signbyte(a) & 0x80) == 0 )
167#define signnegative(a) (signbyte(a) & 0x80)
168
169static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
170{
171 *(short *)&(y->exp) = *(const short *)&(x->exp);
172 *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
173}
174
175#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
176#define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \
177 ((y) + EXTENDED_Ebias) & 0x7fff; }
178#define exponent16(x) (*(short *)&((x)->exp))
179#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); }
180#define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); }
181#define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; }
182
183#define isdenormal(ptr) (exponent(ptr) == EXP_BIAS+EXP_UNDER)
184
185#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
186
187
188/*----- Prototypes for functions written in assembler -----*/
189/* extern void reg_move(FPU_REG *a, FPU_REG *b); */
190
191asmlinkage int FPU_normalize(FPU_REG *x);
192asmlinkage int FPU_normalize_nuo(FPU_REG *x);
193asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
194 FPU_REG *answ, unsigned int control_w, u_char sign,
195 int expa, int expb);
196asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
197 FPU_REG *answ, unsigned int control_w, u_char sign,
198 int expon);
199asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
200 FPU_REG *answ, unsigned int control_w, u_char sign);
201asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
202 FPU_REG *answ, unsigned int control_w, u_char sign,
203 int expa, int expb);
204asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
205 unsigned int control_w, u_char sign);
206asmlinkage unsigned FPU_shrx(void *l, unsigned x);
207asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
208asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
209asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
210 unsigned int control_w, u_char sign);
211
212#ifndef MAKING_PROTO
213#include "fpu_proto.h"
214#endif
215
216#endif /* __ASSEMBLY__ */
217
218#endif /* _FPU_EMU_H_ */
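
The accessor macros above treat the 10-byte register as sigl/sigh plus a 16-bit exp field whose top bit is the sign (signbyte() reads byte 9, i.e. the high byte of exp) and whose low 15 bits hold the exponent biased by EXTENDED_Ebias. A stand-alone sketch of that decoding, mirroring exponent() and getsign() on a made-up value:

#include <stdio.h>
#include <stdint.h>

#define EXTENDED_Ebias 0x3fff

int main(void)
{
        /* A made-up register: negative sign, unbiased exponent +3. */
        uint16_t exp_field = 0x8000 | (EXTENDED_Ebias + 3);

        int sign = (exp_field >> 15) & 1;                       /* getsign(): top bit of byte 9 */
        int expo = (exp_field & 0x7fff) - EXTENDED_Ebias;       /* exponent() */

        printf("sign=%d exponent=%d\n", sign, expo);
        return 0;
}
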
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
new file mode 100644
index 000000000000..1853524c8b57
--- /dev/null
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -0,0 +1,761 @@
1/*---------------------------------------------------------------------------+
2 | fpu_entry.c |
3 | |
4 | The entry functions for wm-FPU-emu |
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | See the files "README" and "COPYING" for further copyright and warranty |
11 | information. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15/*---------------------------------------------------------------------------+
16 | Note: |
17 | The file contains code which accesses user memory. |
18 | Emulator static data may change when user memory is accessed, due to |
19 | other processes using the emulator while swapping is in progress. |
20 +---------------------------------------------------------------------------*/
21
22/*---------------------------------------------------------------------------+
23 | math_emulate(), restore_i387_soft() and save_i387_soft() are the only |
24 | entry points for wm-FPU-emu. |
25 +---------------------------------------------------------------------------*/
26
27#include <linux/signal.h>
28#include <linux/ptrace.h>
29
30#include <asm/uaccess.h>
31#include <asm/desc.h>
32
33#include "fpu_system.h"
34#include "fpu_emu.h"
35#include "exception.h"
36#include "control_w.h"
37#include "status_w.h"
38
39#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
40
41#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */
42
43/* WARNING: These codes are not documented by Intel in their 80486 manual
44 and may not work on FPU clones or later Intel FPUs. */
45
46/* Changes to support the un-doc codes provided by Linus Torvalds. */
47
48#define _d9_d8_ fstp_i /* unofficial code (19) */
49#define _dc_d0_ fcom_st /* unofficial code (14) */
50#define _dc_d8_ fcompst /* unofficial code (1c) */
51#define _dd_c8_ fxch_i /* unofficial code (0d) */
52#define _de_d0_ fcompst /* unofficial code (16) */
53#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */
54#define _df_c8_ fxch_i /* unofficial code (0f) */
55#define _df_d0_ fstp_i /* unofficial code (17) */
56#define _df_d8_ fstp_i /* unofficial code (1f) */
57
58static FUNC const st_instr_table[64] = {
59 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
60 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
61 fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
62 fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
63 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
64 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
65 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
66 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
67};
68
69#else /* Support only documented FPU op-codes */
70
71static FUNC const st_instr_table[64] = {
72 fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
73 fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
74 fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
75 fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
76 fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
77 fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
78 fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
79 fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
80};
81
82#endif /* NO_UNDOC_CODE */
83
84
85#define _NONE_ 0 /* Take no special action */
86#define _REG0_ 1 /* Need to check for not empty st(0) */
87#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
88#define _REGi_ 0 /* Uses st(rm) */
89#define _PUSH_ 3 /* Need to check for space to push onto stack */
90#define _null_ 4 /* Function illegal or not implemented */
91#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */
92#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */
93#define _REGIc 0 /* Compare st(0) and st(rm) */
94#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
95
96#ifndef NO_UNDOC_CODE
97
98/* Un-documented FPU op-codes supported by default. (see above) */
99
100static u_char const type_table[64] = {
101 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
102 _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
103 _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
104 _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
105 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
106 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
107 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
108 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
109};
110
111#else /* Support only documented FPU op-codes */
112
113static u_char const type_table[64] = {
114 _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
115 _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
116 _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
117 _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
118 _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
119 _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
120 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
121 _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
122};
123
124#endif /* NO_UNDOC_CODE */
125
126
127#ifdef RE_ENTRANT_CHECKING
128u_char emulating=0;
129#endif /* RE_ENTRANT_CHECKING */
130
131static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
132 overrides *override);
133
134asmlinkage void math_emulate(long arg)
135{
136 u_char FPU_modrm, byte1;
137 unsigned short code;
138 fpu_addr_modes addr_modes;
139 int unmasked;
140 FPU_REG loaded_data;
141 FPU_REG *st0_ptr;
142 u_char loaded_tag, st0_tag;
143 void __user *data_address;
144 struct address data_sel_off;
145 struct address entry_sel_off;
146 unsigned long code_base = 0;
147 unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
148 struct desc_struct code_descriptor;
149
150#ifdef RE_ENTRANT_CHECKING
151 if ( emulating )
152 {
153 printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
154 }
155 RE_ENTRANT_CHECK_ON;
156#endif /* RE_ENTRANT_CHECKING */
157
158 if (!used_math())
159 {
160 finit();
161 set_used_math();
162 }
163
164 SETUP_DATA_AREA(arg);
165
166 FPU_ORIG_EIP = FPU_EIP;
167
168 if ( (FPU_EFLAGS & 0x00020000) != 0 )
169 {
170 /* Virtual 8086 mode */
171 addr_modes.default_mode = VM86;
172 FPU_EIP += code_base = FPU_CS << 4;
173 code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
174 }
175 else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS )
176 {
177 addr_modes.default_mode = 0;
178 }
179 else if ( FPU_CS == __KERNEL_CS )
180 {
181 printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP);
182 panic("Math emulation needed in kernel");
183 }
184 else
185 {
186
187 if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */
188 {
189 /* Can only handle segmented addressing via the LDT
190 for now, and it must be 16 bit */
191 printk("FPU emulator: Unsupported addressing mode\n");
192 math_abort(FPU_info, SIGILL);
193 }
194
195 code_descriptor = LDT_DESCRIPTOR(FPU_CS);
196 if ( SEG_D_SIZE(code_descriptor) )
197 {
198 /* The above test may be wrong, the book is not clear */
199 /* Segmented 32 bit protected mode */
200 addr_modes.default_mode = SEG32;
201 }
202 else
203 {
204 /* 16 bit protected mode */
205 addr_modes.default_mode = PM16;
206 }
207 FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
208 code_limit = code_base
209 + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor)
210 - 1;
211 if ( code_limit < code_base ) code_limit = 0xffffffff;
212 }
213
214 FPU_lookahead = 1;
215 if (current->ptrace & PT_PTRACED)
216 FPU_lookahead = 0;
217
218 if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
219 &addr_modes.override) )
220 {
221 RE_ENTRANT_CHECK_OFF;
222 printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
223 "FPU emulator: self-modifying code! (emulation impossible)\n",
224 byte1);
225 RE_ENTRANT_CHECK_ON;
226 EXCEPTION(EX_INTERNAL|0x126);
227 math_abort(FPU_info,SIGILL);
228 }
229
230do_another_FPU_instruction:
231
232 no_ip_update = 0;
233
234 FPU_EIP++; /* We have fetched the prefix and first code bytes. */
235
236 if ( addr_modes.default_mode )
237 {
238 /* This checks for the minimum instruction bytes.
239 We also need to check any extra (address mode) code access. */
240 if ( FPU_EIP > code_limit )
241 math_abort(FPU_info,SIGSEGV);
242 }
243
244 if ( (byte1 & 0xf8) != 0xd8 )
245 {
246 if ( byte1 == FWAIT_OPCODE )
247 {
248 if (partial_status & SW_Summary)
249 goto do_the_FPU_interrupt;
250 else
251 goto FPU_fwait_done;
252 }
253#ifdef PARANOID
254 EXCEPTION(EX_INTERNAL|0x128);
255 math_abort(FPU_info,SIGILL);
256#endif /* PARANOID */
257 }
258
259 RE_ENTRANT_CHECK_OFF;
260 FPU_code_access_ok(1);
261 FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
262 RE_ENTRANT_CHECK_ON;
263 FPU_EIP++;
264
265 if (partial_status & SW_Summary)
266 {
267 /* Ignore the error for now if the current instruction is a no-wait
268 control instruction */
269 /* The 80486 manual contradicts itself on this topic,
270 but a real 80486 uses the following instructions:
271 fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
272 */
273 code = (FPU_modrm << 8) | byte1;
274 if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
275 (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
276 fnstsw */
277 ((code & 0xc000) != 0xc000))) ) )
278 {
279 /*
280 * We need to simulate the action of the kernel to FPU
281 * interrupts here.
282 */
283 do_the_FPU_interrupt:
284
285 FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
286
287 RE_ENTRANT_CHECK_OFF;
288 current->thread.trap_no = 16;
289 current->thread.error_code = 0;
290 send_sig(SIGFPE, current, 1);
291 return;
292 }
293 }
294
295 entry_sel_off.offset = FPU_ORIG_EIP;
296 entry_sel_off.selector = FPU_CS;
297 entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
298
299 FPU_rm = FPU_modrm & 7;
300
301 if ( FPU_modrm < 0300 )
302 {
303 /* All of these instructions use the mod/rm byte to get a data address */
304
305 if ( (addr_modes.default_mode & SIXTEEN)
306 ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) )
307 data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off,
308 addr_modes);
309 else
310 data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
311 addr_modes);
312
313 if ( addr_modes.default_mode )
314 {
315 if ( FPU_EIP-1 > code_limit )
316 math_abort(FPU_info,SIGSEGV);
317 }
318
319 if ( !(byte1 & 1) )
320 {
321 unsigned short status1 = partial_status;
322
323 st0_ptr = &st(0);
324 st0_tag = FPU_gettag0();
325
326 /* Stack underflow has priority */
327 if ( NOT_EMPTY_ST0 )
328 {
329 if ( addr_modes.default_mode & PROTECTED )
330 {
331 /* This table works for 16 and 32 bit protected mode */
332 if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] )
333 math_abort(FPU_info,SIGSEGV);
334 }
335
336 unmasked = 0; /* Do this here to stop compiler warnings. */
337 switch ( (byte1 >> 1) & 3 )
338 {
339 case 0:
340 unmasked = FPU_load_single((float __user *)data_address,
341 &loaded_data);
342 loaded_tag = unmasked & 0xff;
343 unmasked &= ~0xff;
344 break;
345 case 1:
346 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
347 break;
348 case 2:
349 unmasked = FPU_load_double((double __user *)data_address,
350 &loaded_data);
351 loaded_tag = unmasked & 0xff;
352 unmasked &= ~0xff;
353 break;
354 case 3:
355 default: /* Used here to suppress gcc warnings. */
356 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
357 break;
358 }
359
360 /* No more access to user memory, it is safe
361 to use static data now */
362
363 /* NaN operands have the next priority. */
364 /* We have to delay looking at st(0) until after
365 loading the data, because that data might contain an SNaN */
366 if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) ||
367 ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) )
368 {
369 /* Restore the status word; we might have loaded a
370 denormal. */
371 partial_status = status1;
372 if ( (FPU_modrm & 0x30) == 0x10 )
373 {
374 /* fcom or fcomp */
375 EXCEPTION(EX_Invalid);
376 setcc(SW_C3 | SW_C2 | SW_C0);
377 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
378 FPU_pop(); /* fcomp, masked, so we pop. */
379 }
380 else
381 {
382 if ( loaded_tag == TAG_Special )
383 loaded_tag = FPU_Special(&loaded_data);
384#ifdef PECULIAR_486
385 /* This is not really needed, but gives behaviour
386 identical to an 80486 */
387 if ( (FPU_modrm & 0x28) == 0x20 )
388 /* fdiv or fsub */
389 real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data);
390 else
391#endif /* PECULIAR_486 */
392 /* fadd, fdivr, fmul, or fsubr */
393 real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr);
394 }
395 goto reg_mem_instr_done;
396 }
397
398 if ( unmasked && !((FPU_modrm & 0x30) == 0x10) )
399 {
400 /* Is not a comparison instruction. */
401 if ( (FPU_modrm & 0x38) == 0x38 )
402 {
403 /* fdivr */
404 if ( (st0_tag == TAG_Zero) &&
405 ((loaded_tag == TAG_Valid)
406 || (loaded_tag == TAG_Special
407 && isdenormal(&loaded_data))) )
408 {
409 if ( FPU_divide_by_zero(0, getsign(&loaded_data))
410 < 0 )
411 {
412 /* We use the fact here that the unmasked
413 exception in the loaded data was for a
414 denormal operand */
415 /* Restore the state of the denormal op bit */
416 partial_status &= ~SW_Denorm_Op;
417 partial_status |= status1 & SW_Denorm_Op;
418 }
419 else
420 setsign(st0_ptr, getsign(&loaded_data));
421 }
422 }
423 goto reg_mem_instr_done;
424 }
425
426 switch ( (FPU_modrm >> 3) & 7 )
427 {
428 case 0: /* fadd */
429 clear_C1();
430 FPU_add(&loaded_data, loaded_tag, 0, control_word);
431 break;
432 case 1: /* fmul */
433 clear_C1();
434 FPU_mul(&loaded_data, loaded_tag, 0, control_word);
435 break;
436 case 2: /* fcom */
437 FPU_compare_st_data(&loaded_data, loaded_tag);
438 break;
439 case 3: /* fcomp */
440 if ( !FPU_compare_st_data(&loaded_data, loaded_tag)
441 && !unmasked )
442 FPU_pop();
443 break;
444 case 4: /* fsub */
445 clear_C1();
446 FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word);
447 break;
448 case 5: /* fsubr */
449 clear_C1();
450 FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
451 break;
452 case 6: /* fdiv */
453 clear_C1();
454 FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word);
455 break;
456 case 7: /* fdivr */
457 clear_C1();
458 if ( st0_tag == TAG_Zero )
459 partial_status = status1; /* Undo any denorm tag,
460 zero-divide has priority. */
461 FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
462 break;
463 }
464 }
465 else
466 {
467 if ( (FPU_modrm & 0x30) == 0x10 )
468 {
469 /* The instruction is fcom or fcomp */
470 EXCEPTION(EX_StackUnder);
471 setcc(SW_C3 | SW_C2 | SW_C0);
472 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
473 FPU_pop(); /* fcomp */
474 }
475 else
476 FPU_stack_underflow();
477 }
478 reg_mem_instr_done:
479 operand_address = data_sel_off;
480 }
481 else
482 {
483 if ( !(no_ip_update =
484 FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1,
485 addr_modes, data_address)) )
486 {
487 operand_address = data_sel_off;
488 }
489 }
490
491 }
492 else
493 {
494 /* None of these instructions access user memory */
495 u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
496
497#ifdef PECULIAR_486
498 /* This is supposed to be undefined, but a real 80486 seems
499 to do this: */
500 operand_address.offset = 0;
501 operand_address.selector = FPU_DS;
502#endif /* PECULIAR_486 */
503
504 st0_ptr = &st(0);
505 st0_tag = FPU_gettag0();
506 switch ( type_table[(int) instr_index] )
507 {
508 case _NONE_: /* also _REGIc: _REGIn */
509 break;
510 case _REG0_:
511 if ( !NOT_EMPTY_ST0 )
512 {
513 FPU_stack_underflow();
514 goto FPU_instruction_done;
515 }
516 break;
517 case _REGIi:
518 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
519 {
520 FPU_stack_underflow_i(FPU_rm);
521 goto FPU_instruction_done;
522 }
523 break;
524 case _REGIp:
525 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
526 {
527 FPU_stack_underflow_pop(FPU_rm);
528 goto FPU_instruction_done;
529 }
530 break;
531 case _REGI_:
532 if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
533 {
534 FPU_stack_underflow();
535 goto FPU_instruction_done;
536 }
537 break;
538 case _PUSH_: /* Only used by the fld st(i) instruction */
539 break;
540 case _null_:
541 FPU_illegal();
542 goto FPU_instruction_done;
543 default:
544 EXCEPTION(EX_INTERNAL|0x111);
545 goto FPU_instruction_done;
546 }
547 (*st_instr_table[(int) instr_index])();
548
549FPU_instruction_done:
550 ;
551 }
552
553 if ( ! no_ip_update )
554 instruction_address = entry_sel_off;
555
556FPU_fwait_done:
557
558#ifdef DEBUG
559 RE_ENTRANT_CHECK_OFF;
560 FPU_printall();
561 RE_ENTRANT_CHECK_ON;
562#endif /* DEBUG */
563
564 if (FPU_lookahead && !need_resched())
565 {
566 FPU_ORIG_EIP = FPU_EIP - code_base;
567 if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
568 &addr_modes.override) )
569 goto do_another_FPU_instruction;
570 }
571
572 if ( addr_modes.default_mode )
573 FPU_EIP -= code_base;
574
575 RE_ENTRANT_CHECK_OFF;
576}
577
578
579/* Support for prefix bytes is not yet complete. To properly handle
580 all prefix bytes, further changes are needed in the emulator code
581 which accesses user address space. Access to separate segments is
582 important for msdos emulation. */
583static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
584 overrides *override)
585{
586 u_char byte;
587 u_char __user *ip = *fpu_eip;
588
589 *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */
590
591 RE_ENTRANT_CHECK_OFF;
592 FPU_code_access_ok(1);
593 FPU_get_user(byte, ip);
594 RE_ENTRANT_CHECK_ON;
595
596 while ( 1 )
597 {
598 switch ( byte )
599 {
600 case ADDR_SIZE_PREFIX:
601 override->address_size = ADDR_SIZE_PREFIX;
602 goto do_next_byte;
603
604 case OP_SIZE_PREFIX:
605 override->operand_size = OP_SIZE_PREFIX;
606 goto do_next_byte;
607
608 case PREFIX_CS:
609 override->segment = PREFIX_CS_;
610 goto do_next_byte;
611 case PREFIX_ES:
612 override->segment = PREFIX_ES_;
613 goto do_next_byte;
614 case PREFIX_SS:
615 override->segment = PREFIX_SS_;
616 goto do_next_byte;
617 case PREFIX_FS:
618 override->segment = PREFIX_FS_;
619 goto do_next_byte;
620 case PREFIX_GS:
621 override->segment = PREFIX_GS_;
622 goto do_next_byte;
623 case PREFIX_DS:
624 override->segment = PREFIX_DS_;
625 goto do_next_byte;
626
627/* lock is not a valid prefix for FPU instructions,
628 let the cpu handle it to generate a SIGILL. */
629/* case PREFIX_LOCK: */
630
631 /* rep.. prefixes have no meaning for FPU instructions */
632 case PREFIX_REPE:
633 case PREFIX_REPNE:
634
635 do_next_byte:
636 ip++;
637 RE_ENTRANT_CHECK_OFF;
638 FPU_code_access_ok(1);
639 FPU_get_user(byte, ip);
640 RE_ENTRANT_CHECK_ON;
641 break;
642 case FWAIT_OPCODE:
643 *Byte = byte;
644 return 1;
645 default:
646 if ( (byte & 0xf8) == 0xd8 )
647 {
648 *Byte = byte;
649 *fpu_eip = ip;
650 return 1;
651 }
652 else
653 {
654 /* Not a valid sequence of prefix bytes followed by
655 an FPU instruction. */
656 *Byte = byte; /* Needed for error message. */
657 return 0;
658 }
659 }
660 }
661}
662
663
664void math_abort(struct info * info, unsigned int signal)
665{
666 FPU_EIP = FPU_ORIG_EIP;
667 current->thread.trap_no = 16;
668 current->thread.error_code = 0;
669 send_sig(signal,current,1);
670 RE_ENTRANT_CHECK_OFF;
671 __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4));
672#ifdef PARANOID
673 printk("ERROR: wm-FPU-emu math_abort failed!\n");
674#endif /* PARANOID */
675}
676
677
678
679#define S387 ((struct i387_soft_struct *)s387)
680#define sstatus_word() \
681 ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
682
683int restore_i387_soft(void *s387, struct _fpstate __user *buf)
684{
685 u_char __user *d = (u_char __user *)buf;
686 int offset, other, i, tags, regnr, tag, newtop;
687
688 RE_ENTRANT_CHECK_OFF;
689 FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10);
690 if (__copy_from_user(&S387->cwd, d, 7*4))
691 return -1;
692 RE_ENTRANT_CHECK_ON;
693
694 d += 7*4;
695
696 S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
697 offset = (S387->ftop & 7) * 10;
698 other = 80 - offset;
699
700 RE_ENTRANT_CHECK_OFF;
701 /* Copy all registers in stack order. */
702 if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other))
703 return -1;
704 if ( offset )
705 if (__copy_from_user((u_char *)&S387->st_space, d+other, offset))
706 return -1;
707 RE_ENTRANT_CHECK_ON;
708
709 /* The tags may need to be corrected now. */
710 tags = S387->twd;
711 newtop = S387->ftop;
712 for ( i = 0; i < 8; i++ )
713 {
714 regnr = (i+newtop) & 7;
715 if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty )
716 {
	  717	      /* The loaded data overrides all other cases. */
718 tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr));
719 tags &= ~(3 << (regnr*2));
720 tags |= (tag & 3) << (regnr*2);
721 }
722 }
723 S387->twd = tags;
724
725 return 0;
726}
727
728
729int save_i387_soft(void *s387, struct _fpstate __user * buf)
730{
731 u_char __user *d = (u_char __user *)buf;
732 int offset = (S387->ftop & 7) * 10, other = 80 - offset;
733
734 RE_ENTRANT_CHECK_OFF;
735 FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
736#ifdef PECULIAR_486
737 S387->cwd &= ~0xe080;
738 /* An 80486 sets nearly all of the reserved bits to 1. */
739 S387->cwd |= 0xffff0040;
740 S387->swd = sstatus_word() | 0xffff0000;
741 S387->twd |= 0xffff0000;
742 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */
745 if (__copy_to_user(d, &S387->cwd, 7*4))
746 return -1;
747 RE_ENTRANT_CHECK_ON;
748
749 d += 7*4;
750
751 RE_ENTRANT_CHECK_OFF;
752 /* Copy all registers in stack order. */
753 if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other))
754 return -1;
755 if ( offset )
756 if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset))
757 return -1;
758 RE_ENTRANT_CHECK_ON;
759
760 return 1;
761}
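
Both context-switch helpers above walk the 80 bytes of st_space in stack order: first the slice from the physical register holding st(0) up to the end of the array, then the wrapped-around remainder. A minimal sketch of that split, using plain memcpy() on an ordinary buffer instead of the __copy_to_user()/__copy_from_user() calls; the helper name and flat destination buffer are illustrative only, not part of the emulator.

#include <string.h>

/* Illustrative only: copy the eight 10-byte registers into a flat
 * buffer in stack order (st(0) first), mirroring the offset/other
 * split used by save_i387_soft() above. */
static void copy_regs_stack_order(unsigned char dst[80],
				  const unsigned char st_space[80],
				  unsigned int ftop)
{
	unsigned int offset = (ftop & 7) * 10;	/* bytes preceding st(0) */
	unsigned int other = 80 - offset;	/* bytes from st(0) to the end */

	memcpy(dst, st_space + offset, other);	/* st(0) .. last physical reg */
	if (offset)
		memcpy(dst + other, st_space, offset);	/* wrap-around part */
}
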
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
new file mode 100644
index 000000000000..e3b5d465587f
--- /dev/null
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -0,0 +1,143 @@
1/*---------------------------------------------------------------------------+
2 | fpu_etc.c |
3 | |
4 | Implement a few FPU instructions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "reg_constant.h"
18
19
20static void fchs(FPU_REG *st0_ptr, u_char st0tag)
21{
22 if ( st0tag ^ TAG_Empty )
23 {
24 signbyte(st0_ptr) ^= SIGN_NEG;
25 clear_C1();
26 }
27 else
28 FPU_stack_underflow();
29}
30
31
32static void fabs(FPU_REG *st0_ptr, u_char st0tag)
33{
34 if ( st0tag ^ TAG_Empty )
35 {
36 setpositive(st0_ptr);
37 clear_C1();
38 }
39 else
40 FPU_stack_underflow();
41}
42
43
44static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
45{
46 switch (st0tag)
47 {
48 case TAG_Zero:
49 setcc(SW_C3);
50 break;
51 case TAG_Valid:
52 if (getsign(st0_ptr) == SIGN_POS)
53 setcc(0);
54 else
55 setcc(SW_C0);
56 break;
57 case TAG_Special:
58 switch ( FPU_Special(st0_ptr) )
59 {
60 case TW_Denormal:
61 if (getsign(st0_ptr) == SIGN_POS)
62 setcc(0);
63 else
64 setcc(SW_C0);
65 if ( denormal_operand() < 0 )
66 {
67#ifdef PECULIAR_486
68 /* This is weird! */
69 if (getsign(st0_ptr) == SIGN_POS)
70 setcc(SW_C3);
71#endif /* PECULIAR_486 */
72 return;
73 }
74 break;
75 case TW_NaN:
76 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */
77 EXCEPTION(EX_Invalid);
78 break;
79 case TW_Infinity:
80 if (getsign(st0_ptr) == SIGN_POS)
81 setcc(0);
82 else
83 setcc(SW_C0);
84 break;
85 default:
86 setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */
87 EXCEPTION(EX_INTERNAL|0x14);
88 break;
89 }
90 break;
91 case TAG_Empty:
92 setcc(SW_C0|SW_C2|SW_C3);
93 EXCEPTION(EX_StackUnder);
94 break;
95 }
96}
97
98
99static void fxam(FPU_REG *st0_ptr, u_char st0tag)
100{
101 int c = 0;
102 switch (st0tag)
103 {
104 case TAG_Empty:
105 c = SW_C3|SW_C0;
106 break;
107 case TAG_Zero:
108 c = SW_C3;
109 break;
110 case TAG_Valid:
111 c = SW_C2;
112 break;
113 case TAG_Special:
114 switch ( FPU_Special(st0_ptr) )
115 {
116 case TW_Denormal:
117 c = SW_C2|SW_C3; /* Denormal */
118 break;
119 case TW_NaN:
120 /* We also use NaN for unsupported types. */
121 if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) )
122 c = SW_C0;
123 break;
124 case TW_Infinity:
125 c = SW_C2|SW_C0;
126 break;
127 }
128 }
129 if ( getsign(st0_ptr) == SIGN_NEG )
130 c |= SW_C1;
131 setcc(c);
132}
133
134
135static FUNC_ST0 const fp_etc_table[] = {
136 fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal,
137 ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal
138};
139
140void FPU_etc(void)
141{
142 (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0());
143}
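
fxam() above reports the class of st(0) through the C3/C2/C0 condition-code bits and the sign through C1. A short decoder for those bits, assuming the conventional x87 status-word layout (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14); the function and the returned strings are illustrative, not part of the emulator.

static const char *fxam_class(unsigned short swd)
{
	int c3 = (swd >> 14) & 1, c2 = (swd >> 10) & 1, c0 = (swd >> 8) & 1;

	if (c3 && c0)
		return "empty";		/* SW_C3|SW_C0 */
	if (c3 && c2)
		return "denormal";	/* SW_C2|SW_C3 */
	if (c3)
		return "zero";		/* SW_C3 */
	if (c2 && c0)
		return "infinity";	/* SW_C2|SW_C0 */
	if (c2)
		return "valid finite";	/* SW_C2 */
	if (c0)
		return "NaN";		/* SW_C0 */
	return "unsupported";		/* no class bit set */
}
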
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
new file mode 100644
index 000000000000..37a8a7fe7e2b
--- /dev/null
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -0,0 +1,140 @@
1#ifndef _FPU_PROTO_H
2#define _FPU_PROTO_H
3
4/* errors.c */
5extern void FPU_illegal(void);
6extern void FPU_printall(void);
7asmlinkage void FPU_exception(int n);
8extern int real_1op_NaN(FPU_REG *a);
9extern int real_2op_NaN(FPU_REG const *b, u_char tagb, int deststnr,
10 FPU_REG const *defaultNaN);
11asmlinkage int arith_invalid(int deststnr);
12asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign);
13extern int set_precision_flag(int flags);
14asmlinkage void set_precision_flag_up(void);
15asmlinkage void set_precision_flag_down(void);
16asmlinkage int denormal_operand(void);
17asmlinkage int arith_overflow(FPU_REG *dest);
18asmlinkage int arith_underflow(FPU_REG *dest);
19extern void FPU_stack_overflow(void);
20extern void FPU_stack_underflow(void);
21extern void FPU_stack_underflow_i(int i);
22extern void FPU_stack_underflow_pop(int i);
23/* fpu_arith.c */
24extern void fadd__(void);
25extern void fmul__(void);
26extern void fsub__(void);
27extern void fsubr_(void);
28extern void fdiv__(void);
29extern void fdivr_(void);
30extern void fadd_i(void);
31extern void fmul_i(void);
32extern void fsubri(void);
33extern void fsub_i(void);
34extern void fdivri(void);
35extern void fdiv_i(void);
36extern void faddp_(void);
37extern void fmulp_(void);
38extern void fsubrp(void);
39extern void fsubp_(void);
40extern void fdivrp(void);
41extern void fdivp_(void);
42/* fpu_aux.c */
43extern void finit(void);
44extern void finit_(void);
45extern void fstsw_(void);
46extern void fp_nop(void);
47extern void fld_i_(void);
48extern void fxch_i(void);
49extern void ffree_(void);
50extern void ffreep(void);
51extern void fst_i_(void);
52extern void fstp_i(void);
53/* fpu_entry.c */
54asmlinkage extern void math_emulate(long arg);
55extern void math_abort(struct info *info, unsigned int signal);
56/* fpu_etc.c */
57extern void FPU_etc(void);
58/* fpu_tags.c */
59extern int FPU_gettag0(void);
60extern int FPU_gettagi(int stnr);
61extern int FPU_gettag(int regnr);
62extern void FPU_settag0(int tag);
63extern void FPU_settagi(int stnr, int tag);
64extern void FPU_settag(int regnr, int tag);
65extern int FPU_Special(FPU_REG const *ptr);
66extern int isNaN(FPU_REG const *ptr);
67extern void FPU_pop(void);
68extern int FPU_empty_i(int stnr);
69extern int FPU_stackoverflow(FPU_REG **st_new_ptr);
70extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
71extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
72extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
73/* fpu_trig.c */
74extern void FPU_triga(void);
75extern void FPU_trigb(void);
76/* get_address.c */
77extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
78 struct address *addr, fpu_addr_modes addr_modes);
79extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
80 struct address *addr, fpu_addr_modes addr_modes);
81/* load_store.c */
82extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
83 void __user *data_address);
84/* poly_2xm1.c */
85extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result);
86/* poly_atan.c */
87extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
88 u_char st1_tag);
89/* poly_l2.c */
90extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
91extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
92 FPU_REG *d);
93/* poly_sin.c */
94extern void poly_sine(FPU_REG *st0_ptr);
95extern void poly_cos(FPU_REG *st0_ptr);
96/* poly_tan.c */
97extern void poly_tan(FPU_REG *st0_ptr);
98/* reg_add_sub.c */
99extern int FPU_add(FPU_REG const *b, u_char tagb, int destrnr, int control_w);
100extern int FPU_sub(int flags, int rm, int control_w);
101/* reg_compare.c */
102extern int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag);
103extern void fcom_st(void);
104extern void fcompst(void);
105extern void fcompp(void);
106extern void fucom_(void);
107extern void fucomp(void);
108extern void fucompp(void);
109/* reg_constant.c */
110extern void fconst(void);
111/* reg_ld_str.c */
112extern int FPU_load_extended(long double __user *s, int stnr);
113extern int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data);
114extern int FPU_load_single(float __user *single, FPU_REG *loaded_data);
115extern int FPU_load_int64(long long __user *_s);
116extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
117extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
118extern int FPU_load_bcd(u_char __user *s);
119extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
120 long double __user *d);
121extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat);
122extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single);
123extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d);
124extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
125extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
126extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
127extern int FPU_round_to_int(FPU_REG *r, u_char tag);
128extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s);
129extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
130extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d);
131extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address);
132extern int FPU_tagof(FPU_REG *ptr);
133/* reg_mul.c */
134extern int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w);
135
136extern int FPU_div(int flags, int regrm, int control_w);
137/* reg_convert.c */
138extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
139#endif /* _FPU_PROTO_H */
140
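
The FPU_sub() and FPU_div() prototypes above take an int where an operand would be expected: callers either pass a stack register number, or set the LOADED flag and pass a pointer cast to int with the operand's tag in the low bits, as in FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word) in fpu_entry.c. A self-contained sketch of that convention; DEMO_LOADED, the struct and the helper are stand-ins for the real fpu_emu.h definitions, and the int-to-pointer round trip assumes the emulator's 32-bit environment.

#define DEMO_LOADED 0x4000		/* stand-in for the real LOADED flag */

struct demo_reg { unsigned int sigl, sigh; short exp; };

/* Resolve the second operand of a two-operand instruction: either an
 * already-loaded value smuggled through 'rm', or stack register st(rm). */
static const struct demo_reg *demo_pick_operand(int flags, int rm,
						const struct demo_reg regs[8],
						const unsigned char tags[8],
						int top, int *tag)
{
	if (flags & DEMO_LOADED) {
		*tag = flags & 0x0f;	/* low bits carry the operand's tag */
		return (const struct demo_reg *)(unsigned long)rm;
	}
	*tag = tags[(top + rm) & 7];	/* ordinary st(rm) operand */
	return &regs[(top + rm) & 7];
}
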
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
new file mode 100644
index 000000000000..a3ae28c49ddd
--- /dev/null
+++ b/arch/x86/math-emu/fpu_system.h
@@ -0,0 +1,90 @@
1/*---------------------------------------------------------------------------+
2 | fpu_system.h |
3 | |
4 | Copyright (C) 1992,1994,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _FPU_SYSTEM_H
11#define _FPU_SYSTEM_H
12
13/* system dependent definitions */
14
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mm.h>
18
19/* This sets the pointer FPU_info to point to the argument part
20 of the stack frame of math_emulate() */
21#define SETUP_DATA_AREA(arg) FPU_info = (struct info *) &arg
22
23/* s is always from a cpu register, and the cpu does bounds checking
24 * during register load --> no further bounds checks needed */
25#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3])
26#define SEG_D_SIZE(x) ((x).b & (3 << 21))
27#define SEG_G_BIT(x) ((x).b & (1 << 23))
28#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1)
29#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23)))
30#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \
31 | (((s).b & 0xff) << 16) | ((s).a >> 16))
32#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff))
33#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11))
34#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9))
35#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
36 == (1 << 10))
37
38#define I387 (current->thread.i387)
39#define FPU_info (I387.soft.info)
40
41#define FPU_CS (*(unsigned short *) &(FPU_info->___cs))
42#define FPU_SS (*(unsigned short *) &(FPU_info->___ss))
43#define FPU_DS (*(unsigned short *) &(FPU_info->___ds))
44#define FPU_EAX (FPU_info->___eax)
45#define FPU_EFLAGS (FPU_info->___eflags)
46#define FPU_EIP (FPU_info->___eip)
47#define FPU_ORIG_EIP (FPU_info->___orig_eip)
48
49#define FPU_lookahead (I387.soft.lookahead)
50
51/* nz if ip_offset and cs_selector are not to be set for the current
52 instruction. */
53#define no_ip_update (*(u_char *)&(I387.soft.no_update))
54#define FPU_rm (*(u_char *)&(I387.soft.rm))
55
56/* Number of bytes of data which can be legally accessed by the current
57 instruction. This only needs to hold a number <= 108, so a byte will do. */
58#define access_limit (*(u_char *)&(I387.soft.alimit))
59
60#define partial_status (I387.soft.swd)
61#define control_word (I387.soft.cwd)
62#define fpu_tag_word (I387.soft.twd)
63#define registers (I387.soft.st_space)
64#define top (I387.soft.ftop)
65
66#define instruction_address (*(struct address *)&I387.soft.fip)
67#define operand_address (*(struct address *)&I387.soft.foo)
68
69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
70 math_abort(FPU_info,SIGSEGV)
71#define FPU_abort math_abort(FPU_info, SIGSEGV)
72
73#undef FPU_IGNORE_CODE_SEGV
74#ifdef FPU_IGNORE_CODE_SEGV
75/* access_ok() is very expensive, and causes the emulator to run
76 about 20% slower if applied to the code. Anyway, errors due to bad
77 code addresses should be much rarer than errors due to bad data
78 addresses. */
79#define FPU_code_access_ok(z)
80#else
81/* A simpler test than access_ok() can probably be done for
82 FPU_code_access_ok() because the only possible error is to step
83 past the upper boundary of a legal code area. */
84#define FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z)
85#endif
86
87#define FPU_get_user(x,y) get_user((x),(y))
88#define FPU_put_user(x,y) put_user((x),(y))
89
90#endif
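
The SEG_* macros above pick the base and limit fields out of the two 32-bit words of an LDT descriptor. A self-contained restatement of the two extractions with stdint types; the struct and function names are illustrative, and granularity scaling (SEG_GRANULARITY) is not applied here.

#include <stdint.h>

struct demo_desc { uint32_t a, b; };	/* low and high descriptor dwords */

static uint32_t demo_seg_base(struct demo_desc d)
{
	/* base 31..24 and 23..16 live in b, base 15..0 in the top of a */
	return (d.b & 0xff000000) | ((d.b & 0xff) << 16) | (d.a >> 16);
}

static uint32_t demo_seg_limit(struct demo_desc d)
{
	/* limit 15..0 come from a; the macro keeps the whole byte 23..16
	 * of b, which holds limit 19..16 plus the descriptor flag bits */
	return (d.b & 0xff0000) | (d.a & 0xffff);
}
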
diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c
new file mode 100644
index 000000000000..cb436fe20e4c
--- /dev/null
+++ b/arch/x86/math-emu/fpu_tags.c
@@ -0,0 +1,127 @@
1/*---------------------------------------------------------------------------+
2 | fpu_tags.c |
3 | |
4 | Set FPU register tags. |
5 | |
6 | Copyright (C) 1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_emu.h"
14#include "fpu_system.h"
15#include "exception.h"
16
17
18void FPU_pop(void)
19{
20 fpu_tag_word |= 3 << ((top & 7)*2);
21 top++;
22}
23
24
25int FPU_gettag0(void)
26{
27 return (fpu_tag_word >> ((top & 7)*2)) & 3;
28}
29
30
31int FPU_gettagi(int stnr)
32{
33 return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3;
34}
35
36
37int FPU_gettag(int regnr)
38{
39 return (fpu_tag_word >> ((regnr & 7)*2)) & 3;
40}
41
42
43void FPU_settag0(int tag)
44{
45 int regnr = top;
46 regnr &= 7;
47 fpu_tag_word &= ~(3 << (regnr*2));
48 fpu_tag_word |= (tag & 3) << (regnr*2);
49}
50
51
52void FPU_settagi(int stnr, int tag)
53{
54 int regnr = stnr+top;
55 regnr &= 7;
56 fpu_tag_word &= ~(3 << (regnr*2));
57 fpu_tag_word |= (tag & 3) << (regnr*2);
58}
59
60
61void FPU_settag(int regnr, int tag)
62{
63 regnr &= 7;
64 fpu_tag_word &= ~(3 << (regnr*2));
65 fpu_tag_word |= (tag & 3) << (regnr*2);
66}
67
68
69int FPU_Special(FPU_REG const *ptr)
70{
71 int exp = exponent(ptr);
72
73 if ( exp == EXP_BIAS+EXP_UNDER )
74 return TW_Denormal;
75 else if ( exp != EXP_BIAS+EXP_OVER )
76 return TW_NaN;
77 else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) )
78 return TW_Infinity;
79 return TW_NaN;
80}
81
82
83int isNaN(FPU_REG const *ptr)
84{
85 return ( (exponent(ptr) == EXP_BIAS+EXP_OVER)
86 && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) );
87}
88
89
90int FPU_empty_i(int stnr)
91{
92 int regnr = (top+stnr) & 7;
93
94 return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty;
95}
96
97
98int FPU_stackoverflow(FPU_REG **st_new_ptr)
99{
100 *st_new_ptr = &st(-1);
101
102 return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty;
103}
104
105
106void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
107{
108 reg_copy(r, &st(stnr));
109 FPU_settagi(stnr, tag);
110}
111
112void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
113{
114 reg_copy(r, &st(1));
115 FPU_settagi(1, tag);
116}
117
118void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
119{
120 int regnr = top;
121 regnr &= 7;
122
123 reg_copy(r, &st(0));
124
125 fpu_tag_word &= ~(3 << (regnr*2));
126 fpu_tag_word |= (tag & 3) << (regnr*2);
127}
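
The helpers above keep one two-bit tag per physical register and address registers relative to the current top of stack. A minimal standalone model of that bookkeeping, assuming the usual tag values (Valid = 0, Zero = 1, Special = 2, Empty = 3); the demo_* names are not part of the emulator.

static int demo_top;			/* models I387.soft.ftop */
static unsigned int demo_tag_word;	/* two bits per physical register */

static int demo_gettagi(int stnr)	/* tag of st(stnr) */
{
	return (demo_tag_word >> (((demo_top + stnr) & 7) * 2)) & 3;
}

static void demo_settagi(int stnr, int tag)
{
	int regnr = (demo_top + stnr) & 7;

	demo_tag_word &= ~(3 << (regnr * 2));
	demo_tag_word |= (tag & 3) << (regnr * 2);
}

static void demo_pop(void)		/* mark old st(0) empty, then bump top */
{
	demo_tag_word |= 3 << ((demo_top & 7) * 2);
	demo_top++;
}
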
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
new file mode 100644
index 000000000000..403cbde1d425
--- /dev/null
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -0,0 +1,1845 @@
1/*---------------------------------------------------------------------------+
2 | fpu_trig.c |
3 | |
4 | Implementation of the FPU "transcendental" functions. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "exception.h"
15#include "fpu_emu.h"
16#include "status_w.h"
17#include "control_w.h"
18#include "reg_constant.h"
19
20static void rem_kernel(unsigned long long st0, unsigned long long *y,
21 unsigned long long st1,
22 unsigned long long q, int n);
23
24#define BETTER_THAN_486
25
26#define FCOS 4
27
28/* Used only by fptan, fsin, fcos, and fsincos. */
29/* This routine produces very accurate results, similar to
30 using a value of pi with more than 128 bits precision. */
31/* Limited measurements show no results worse than 64 bit precision
32 except for the results for arguments close to 2^63, where the
33 precision of the result sometimes degrades to about 63.9 bits */
34static int trig_arg(FPU_REG *st0_ptr, int even)
35{
36 FPU_REG tmp;
37 u_char tmptag;
38 unsigned long long q;
39 int old_cw = control_word, saved_status = partial_status;
40 int tag, st0_tag = TAG_Valid;
41
42 if ( exponent(st0_ptr) >= 63 )
43 {
44 partial_status |= SW_C2; /* Reduction incomplete. */
45 return -1;
46 }
47
48 control_word &= ~CW_RC;
49 control_word |= RC_CHOP;
50
51 setpositive(st0_ptr);
52 tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
53 SIGN_POS);
54
55 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
56 to 2^64 */
57 q = significand(&tmp);
58 if ( q )
59 {
60 rem_kernel(significand(st0_ptr),
61 &significand(&tmp),
62 significand(&CONST_PI2),
63 q, exponent(st0_ptr) - exponent(&CONST_PI2));
64 setexponent16(&tmp, exponent(&CONST_PI2));
65 st0_tag = FPU_normalize(&tmp);
66 FPU_copy_to_reg0(&tmp, st0_tag);
67 }
68
69 if ( (even && !(q & 1)) || (!even && (q & 1)) )
70 {
71 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION);
72
73#ifdef BETTER_THAN_486
74 /* So far, the results are exact but based upon a 64 bit
75 precision approximation to pi/2. The technique used
76 now is equivalent to using an approximation to pi/2 which
77 is accurate to about 128 bits. */
78 if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) )
79 {
80 /* This code gives the effect of having pi/2 to better than
81 128 bits precision. */
82
83 significand(&tmp) = q + 1;
84 setexponent16(&tmp, 63);
85 FPU_normalize(&tmp);
86 tmptag =
87 FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS,
88 exponent(&CONST_PI2extra) + exponent(&tmp));
89 setsign(&tmp, getsign(&CONST_PI2extra));
90 st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
91 if ( signnegative(st0_ptr) )
92 {
93 /* CONST_PI2extra is negative, so the result of the addition
94 can be negative. This means that the argument is actually
95 in a different quadrant. The correction is always < pi/2,
96 so it can't overflow into yet another quadrant. */
97 setpositive(st0_ptr);
98 q++;
99 }
100 }
101#endif /* BETTER_THAN_486 */
102 }
103#ifdef BETTER_THAN_486
104 else
105 {
106 /* So far, the results are exact but based upon a 64 bit
107 precision approximation to pi/2. The technique used
108 now is equivalent to using an approximation to pi/2 which
109 is accurate to about 128 bits. */
110 if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
111 || (q > 1) )
112 {
	  113	    /* This code gives the effect of having pi/2 to better than
114 128 bits precision. */
115
116 significand(&tmp) = q;
117 setexponent16(&tmp, 63);
118 FPU_normalize(&tmp); /* This must return TAG_Valid */
119 tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION,
120 SIGN_POS,
121 exponent(&CONST_PI2extra) + exponent(&tmp));
122 setsign(&tmp, getsign(&CONST_PI2extra));
123 st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp,
124 FULL_PRECISION);
125 if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) &&
126 ((st0_ptr->sigh > CONST_PI2.sigh)
127 || ((st0_ptr->sigh == CONST_PI2.sigh)
128 && (st0_ptr->sigl > CONST_PI2.sigl))) )
129 {
130 /* CONST_PI2extra is negative, so the result of the
131 subtraction can be larger than pi/2. This means
132 that the argument is actually in a different quadrant.
133 The correction is always < pi/2, so it can't overflow
134 into yet another quadrant. */
135 st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2,
136 FULL_PRECISION);
137 q++;
138 }
139 }
140 }
141#endif /* BETTER_THAN_486 */
142
143 FPU_settag0(st0_tag);
144 control_word = old_cw;
145 partial_status = saved_status & ~SW_C2; /* Reduction complete. */
146
147 return (q & 3) | even;
148}
149
150
151/* Convert a long to register */
152static void convert_l2reg(long const *arg, int deststnr)
153{
154 int tag;
155 long num = *arg;
156 u_char sign;
157 FPU_REG *dest = &st(deststnr);
158
159 if (num == 0)
160 {
161 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
162 return;
163 }
164
165 if (num > 0)
166 { sign = SIGN_POS; }
167 else
168 { num = -num; sign = SIGN_NEG; }
169
170 dest->sigh = num;
171 dest->sigl = 0;
172 setexponent16(dest, 31);
173 tag = FPU_normalize(dest);
174 FPU_settagi(deststnr, tag);
175 setsign(dest, sign);
176 return;
177}
178
179
180static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
181{
182 if ( st0_tag == TAG_Empty )
183 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
184 else if ( st0_tag == TW_NaN )
185 real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */
186#ifdef PARANOID
187 else
188 EXCEPTION(EX_INTERNAL|0x0112);
189#endif /* PARANOID */
190}
191
192
193static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
194{
195 int isNaN;
196
197 switch ( st0_tag )
198 {
199 case TW_NaN:
200 isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000);
201 if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */
202 {
203 EXCEPTION(EX_Invalid);
204 if ( control_word & CW_Invalid )
205 {
206 /* The masked response */
207 /* Convert to a QNaN */
208 st0_ptr->sigh |= 0x40000000;
209 push();
210 FPU_copy_to_reg0(st0_ptr, TAG_Special);
211 }
212 }
213 else if ( isNaN )
214 {
215 /* A QNaN */
216 push();
217 FPU_copy_to_reg0(st0_ptr, TAG_Special);
218 }
219 else
220 {
221 /* pseudoNaN or other unsupported */
222 EXCEPTION(EX_Invalid);
223 if ( control_word & CW_Invalid )
224 {
225 /* The masked response */
226 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
227 push();
228 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
229 }
230 }
231 break; /* return with a NaN in st(0) */
232#ifdef PARANOID
233 default:
234 EXCEPTION(EX_INTERNAL|0x0112);
235#endif /* PARANOID */
236 }
237}
238
239
240/*---------------------------------------------------------------------------*/
241
242static void f2xm1(FPU_REG *st0_ptr, u_char tag)
243{
244 FPU_REG a;
245
246 clear_C1();
247
248 if ( tag == TAG_Valid )
249 {
250 /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
251 if ( exponent(st0_ptr) < 0 )
252 {
253 denormal_arg:
254
255 FPU_to_exp16(st0_ptr, &a);
256
257 /* poly_2xm1(x) requires 0 < st(0) < 1. */
258 poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
259 }
260 set_precision_flag_up(); /* 80486 appears to always do this */
261 return;
262 }
263
264 if ( tag == TAG_Zero )
265 return;
266
267 if ( tag == TAG_Special )
268 tag = FPU_Special(st0_ptr);
269
270 switch ( tag )
271 {
272 case TW_Denormal:
273 if ( denormal_operand() < 0 )
274 return;
275 goto denormal_arg;
276 case TW_Infinity:
277 if ( signnegative(st0_ptr) )
278 {
279 /* -infinity gives -1 (p16-10) */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 setnegative(st0_ptr);
282 }
283 return;
284 default:
285 single_arg_error(st0_ptr, tag);
286 }
287}
288
289
290static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
291{
292 FPU_REG *st_new_ptr;
293 int q;
294 u_char arg_sign = getsign(st0_ptr);
295
296 /* Stack underflow has higher priority */
297 if ( st0_tag == TAG_Empty )
298 {
299 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
300 if ( control_word & CW_Invalid )
301 {
302 st_new_ptr = &st(-1);
303 push();
304 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
305 }
306 return;
307 }
308
309 if ( STACK_OVERFLOW )
310 { FPU_stack_overflow(); return; }
311
312 if ( st0_tag == TAG_Valid )
313 {
314 if ( exponent(st0_ptr) > -40 )
315 {
316 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
317 {
318 /* Operand is out of range */
319 return;
320 }
321
322 poly_tan(st0_ptr);
323 setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
324 set_precision_flag_up(); /* We do not really know if up or down */
325 }
326 else
327 {
328 /* For a small arg, the result == the argument */
329 /* Underflow may happen */
330
331 denormal_arg:
332
333 FPU_to_exp16(st0_ptr, st0_ptr);
334
335 st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
336 FPU_settag0(st0_tag);
337 }
338 push();
339 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
340 return;
341 }
342
343 if ( st0_tag == TAG_Zero )
344 {
345 push();
346 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
347 setcc(0);
348 return;
349 }
350
351 if ( st0_tag == TAG_Special )
352 st0_tag = FPU_Special(st0_ptr);
353
354 if ( st0_tag == TW_Denormal )
355 {
356 if ( denormal_operand() < 0 )
357 return;
358
359 goto denormal_arg;
360 }
361
362 if ( st0_tag == TW_Infinity )
363 {
364 /* The 80486 treats infinity as an invalid operand */
365 if ( arith_invalid(0) >= 0 )
366 {
367 st_new_ptr = &st(-1);
368 push();
369 arith_invalid(0);
370 }
371 return;
372 }
373
374 single_arg_2_error(st0_ptr, st0_tag);
375}
376
377
378static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
379{
380 FPU_REG *st_new_ptr;
381 u_char sign;
382 register FPU_REG *st1_ptr = st0_ptr; /* anticipate */
383
384 if ( STACK_OVERFLOW )
385 { FPU_stack_overflow(); return; }
386
387 clear_C1();
388
389 if ( st0_tag == TAG_Valid )
390 {
391 long e;
392
393 push();
394 sign = getsign(st1_ptr);
395 reg_copy(st1_ptr, st_new_ptr);
396 setexponent16(st_new_ptr, exponent(st_new_ptr));
397
398 denormal_arg:
399
400 e = exponent16(st_new_ptr);
401 convert_l2reg(&e, 1);
402 setexponentpos(st_new_ptr, 0);
403 setsign(st_new_ptr, sign);
404 FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
405 return;
406 }
407 else if ( st0_tag == TAG_Zero )
408 {
409 sign = getsign(st0_ptr);
410
411 if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 )
412 return;
413
414 push();
415 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
416 setsign(st_new_ptr, sign);
417 return;
418 }
419
420 if ( st0_tag == TAG_Special )
421 st0_tag = FPU_Special(st0_ptr);
422
423 if ( st0_tag == TW_Denormal )
424 {
425 if (denormal_operand() < 0 )
426 return;
427
428 push();
429 sign = getsign(st1_ptr);
430 FPU_to_exp16(st1_ptr, st_new_ptr);
431 goto denormal_arg;
432 }
433 else if ( st0_tag == TW_Infinity )
434 {
435 sign = getsign(st0_ptr);
436 setpositive(st0_ptr);
437 push();
438 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
439 setsign(st_new_ptr, sign);
440 return;
441 }
442 else if ( st0_tag == TW_NaN )
443 {
444 if ( real_1op_NaN(st0_ptr) < 0 )
445 return;
446
447 push();
448 FPU_copy_to_reg0(st0_ptr, TAG_Special);
449 return;
450 }
451 else if ( st0_tag == TAG_Empty )
452 {
453 /* Is this the correct behaviour? */
454 if ( control_word & EX_Invalid )
455 {
456 FPU_stack_underflow();
457 push();
458 FPU_stack_underflow();
459 }
460 else
461 EXCEPTION(EX_StackUnder);
462 }
463#ifdef PARANOID
464 else
465 EXCEPTION(EX_INTERNAL | 0x119);
466#endif /* PARANOID */
467}
468
469
470static void fdecstp(void)
471{
472 clear_C1();
473 top--;
474}
475
476static void fincstp(void)
477{
478 clear_C1();
479 top++;
480}
481
482
483static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
484{
485 int expon;
486
487 clear_C1();
488
489 if ( st0_tag == TAG_Valid )
490 {
491 u_char tag;
492
493 if (signnegative(st0_ptr))
494 {
495 arith_invalid(0); /* sqrt(negative) is invalid */
496 return;
497 }
498
499 /* make st(0) in [1.0 .. 4.0) */
500 expon = exponent(st0_ptr);
501
502 denormal_arg:
503
504 setexponent16(st0_ptr, (expon & 1));
505
506 /* Do the computation, the sign of the result will be positive. */
507 tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
508 addexponent(st0_ptr, expon >> 1);
509 FPU_settag0(tag);
510 return;
511 }
512
513 if ( st0_tag == TAG_Zero )
514 return;
515
516 if ( st0_tag == TAG_Special )
517 st0_tag = FPU_Special(st0_ptr);
518
519 if ( st0_tag == TW_Infinity )
520 {
521 if ( signnegative(st0_ptr) )
522 arith_invalid(0); /* sqrt(-Infinity) is invalid */
523 return;
524 }
525 else if ( st0_tag == TW_Denormal )
526 {
527 if (signnegative(st0_ptr))
528 {
529 arith_invalid(0); /* sqrt(negative) is invalid */
530 return;
531 }
532
533 if ( denormal_operand() < 0 )
534 return;
535
536 FPU_to_exp16(st0_ptr, st0_ptr);
537
538 expon = exponent16(st0_ptr);
539
540 goto denormal_arg;
541 }
542
543 single_arg_error(st0_ptr, st0_tag);
544
545}
546
547
548static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
549{
550 int flags, tag;
551
552 if ( st0_tag == TAG_Valid )
553 {
554 u_char sign;
555
556 denormal_arg:
557
558 sign = getsign(st0_ptr);
559
560 if (exponent(st0_ptr) > 63)
561 return;
562
563 if ( st0_tag == TW_Denormal )
564 {
565 if (denormal_operand() < 0 )
566 return;
567 }
568
569 /* Fortunately, this can't overflow to 2^64 */
570 if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) )
571 set_precision_flag(flags);
572
573 setexponent16(st0_ptr, 63);
574 tag = FPU_normalize(st0_ptr);
575 setsign(st0_ptr, sign);
576 FPU_settag0(tag);
577 return;
578 }
579
580 if ( st0_tag == TAG_Zero )
581 return;
582
583 if ( st0_tag == TAG_Special )
584 st0_tag = FPU_Special(st0_ptr);
585
586 if ( st0_tag == TW_Denormal )
587 goto denormal_arg;
588 else if ( st0_tag == TW_Infinity )
589 return;
590 else
591 single_arg_error(st0_ptr, st0_tag);
592}
593
594
595static int fsin(FPU_REG *st0_ptr, u_char tag)
596{
597 u_char arg_sign = getsign(st0_ptr);
598
599 if ( tag == TAG_Valid )
600 {
601 int q;
602
603 if ( exponent(st0_ptr) > -40 )
604 {
605 if ( (q = trig_arg(st0_ptr, 0)) == -1 )
606 {
607 /* Operand is out of range */
608 return 1;
609 }
610
611 poly_sine(st0_ptr);
612
613 if (q & 2)
614 changesign(st0_ptr);
615
616 setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
617
618 /* We do not really know if up or down */
619 set_precision_flag_up();
620 return 0;
621 }
622 else
623 {
624 /* For a small arg, the result == the argument */
625 set_precision_flag_up(); /* Must be up. */
626 return 0;
627 }
628 }
629
630 if ( tag == TAG_Zero )
631 {
632 setcc(0);
633 return 0;
634 }
635
636 if ( tag == TAG_Special )
637 tag = FPU_Special(st0_ptr);
638
639 if ( tag == TW_Denormal )
640 {
641 if ( denormal_operand() < 0 )
642 return 1;
643
644 /* For a small arg, the result == the argument */
645 /* Underflow may happen */
646 FPU_to_exp16(st0_ptr, st0_ptr);
647
648 tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
649
650 FPU_settag0(tag);
651
652 return 0;
653 }
654 else if ( tag == TW_Infinity )
655 {
656 /* The 80486 treats infinity as an invalid operand */
657 arith_invalid(0);
658 return 1;
659 }
660 else
661 {
662 single_arg_error(st0_ptr, tag);
663 return 1;
664 }
665}
666
667
668static int f_cos(FPU_REG *st0_ptr, u_char tag)
669{
670 u_char st0_sign;
671
672 st0_sign = getsign(st0_ptr);
673
674 if ( tag == TAG_Valid )
675 {
676 int q;
677
678 if ( exponent(st0_ptr) > -40 )
679 {
680 if ( (exponent(st0_ptr) < 0)
681 || ((exponent(st0_ptr) == 0)
682 && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) )
683 {
684 poly_cos(st0_ptr);
685
686 /* We do not really know if up or down */
687 set_precision_flag_down();
688
689 return 0;
690 }
691 else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 )
692 {
693 poly_sine(st0_ptr);
694
695 if ((q+1) & 2)
696 changesign(st0_ptr);
697
698 /* We do not really know if up or down */
699 set_precision_flag_down();
700
701 return 0;
702 }
703 else
704 {
705 /* Operand is out of range */
706 return 1;
707 }
708 }
709 else
710 {
711 denormal_arg:
712
713 setcc(0);
714 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
715#ifdef PECULIAR_486
716 set_precision_flag_down(); /* 80486 appears to do this. */
717#else
718 set_precision_flag_up(); /* Must be up. */
719#endif /* PECULIAR_486 */
720 return 0;
721 }
722 }
723 else if ( tag == TAG_Zero )
724 {
725 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
726 setcc(0);
727 return 0;
728 }
729
730 if ( tag == TAG_Special )
731 tag = FPU_Special(st0_ptr);
732
733 if ( tag == TW_Denormal )
734 {
735 if ( denormal_operand() < 0 )
736 return 1;
737
738 goto denormal_arg;
739 }
740 else if ( tag == TW_Infinity )
741 {
742 /* The 80486 treats infinity as an invalid operand */
743 arith_invalid(0);
744 return 1;
745 }
746 else
747 {
748 single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
749 return 1;
750 }
751}
752
753
754static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
755{
756 f_cos(st0_ptr, st0_tag);
757}
758
759
760static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
761{
762 FPU_REG *st_new_ptr;
763 FPU_REG arg;
764 u_char tag;
765
766 /* Stack underflow has higher priority */
767 if ( st0_tag == TAG_Empty )
768 {
769 FPU_stack_underflow(); /* Puts a QNaN in st(0) */
770 if ( control_word & CW_Invalid )
771 {
772 st_new_ptr = &st(-1);
773 push();
774 FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
775 }
776 return;
777 }
778
779 if ( STACK_OVERFLOW )
780 { FPU_stack_overflow(); return; }
781
782 if ( st0_tag == TAG_Special )
783 tag = FPU_Special(st0_ptr);
784 else
785 tag = st0_tag;
786
787 if ( tag == TW_NaN )
788 {
789 single_arg_2_error(st0_ptr, TW_NaN);
790 return;
791 }
792 else if ( tag == TW_Infinity )
793 {
794 /* The 80486 treats infinity as an invalid operand */
795 if ( arith_invalid(0) >= 0 )
796 {
797 /* Masked response */
798 push();
799 arith_invalid(0);
800 }
801 return;
802 }
803
804 reg_copy(st0_ptr, &arg);
805 if ( !fsin(st0_ptr, st0_tag) )
806 {
807 push();
808 FPU_copy_to_reg0(&arg, st0_tag);
809 f_cos(&st(0), st0_tag);
810 }
811 else
812 {
813 /* An error, so restore st(0) */
814 FPU_copy_to_reg0(&arg, st0_tag);
815 }
816}
817
818
819/*---------------------------------------------------------------------------*/
820/* The following all require two arguments: st(0) and st(1) */
821
822/* A lean, mean kernel for the fprem instructions. This relies upon
823 the division and rounding to an integer in do_fprem giving an
824 exact result. Because of this, rem_kernel() needs to deal only with
825 the least significant 64 bits, the more significant bits of the
826 result must be zero.
827 */
828static void rem_kernel(unsigned long long st0, unsigned long long *y,
829 unsigned long long st1,
830 unsigned long long q, int n)
831{
832 int dummy;
833 unsigned long long x;
834
835 x = st0 << n;
836
837 /* Do the required multiplication and subtraction in the one operation */
838
839 /* lsw x -= lsw st1 * lsw q */
840 asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1"
841 :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]),
842 "=a" (dummy)
843 :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0])
844 :"%dx");
845 /* msw x -= msw st1 * lsw q */
846 asm volatile ("mull %3; subl %%eax,%0"
847 :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
848 :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0])
849 :"%dx");
850 /* msw x -= lsw st1 * msw q */
851 asm volatile ("mull %3; subl %%eax,%0"
852 :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
853 :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1])
854 :"%dx");
855
856 *y = x;
857}
858
859
860/* Remainder of st(0) / st(1) */
861/* This routine produces exact results, i.e. there is never any
862 rounding or truncation, etc of the result. */
863static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
864{
865 FPU_REG *st1_ptr = &st(1);
866 u_char st1_tag = FPU_gettagi(1);
867
868 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
869 {
870 FPU_REG tmp, st0, st1;
871 u_char st0_sign, st1_sign;
872 u_char tmptag;
873 int tag;
874 int old_cw;
875 int expdif;
876 long long q;
877 unsigned short saved_status;
878 int cc;
879
880 fprem_valid:
881 /* Convert registers for internal use. */
882 st0_sign = FPU_to_exp16(st0_ptr, &st0);
883 st1_sign = FPU_to_exp16(st1_ptr, &st1);
884 expdif = exponent16(&st0) - exponent16(&st1);
885
886 old_cw = control_word;
887 cc = 0;
888
889 /* We want the status following the denorm tests, but don't want
890 the status changed by the arithmetic operations. */
891 saved_status = partial_status;
892 control_word &= ~CW_RC;
893 control_word |= RC_CHOP;
894
895 if ( expdif < 64 )
896 {
897 /* This should be the most common case */
898
899 if ( expdif > -2 )
900 {
901 u_char sign = st0_sign ^ st1_sign;
902 tag = FPU_u_div(&st0, &st1, &tmp,
903 PR_64_BITS | RC_CHOP | 0x3f,
904 sign);
905 setsign(&tmp, sign);
906
907 if ( exponent(&tmp) >= 0 )
908 {
909 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
910 overflow to 2^64 */
911 q = significand(&tmp);
912
913 rem_kernel(significand(&st0),
914 &significand(&tmp),
915 significand(&st1),
916 q, expdif);
917
918 setexponent16(&tmp, exponent16(&st1));
919 }
920 else
921 {
922 reg_copy(&st0, &tmp);
923 q = 0;
924 }
925
926 if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) )
927 {
928 /* We may need to subtract st(1) once more,
929 to get a result <= 1/2 of st(1). */
930 unsigned long long x;
931 expdif = exponent16(&st1) - exponent16(&tmp);
932 if ( expdif <= 1 )
933 {
934 if ( expdif == 0 )
935 x = significand(&st1) - significand(&tmp);
936 else /* expdif is 1 */
937 x = (significand(&st1) << 1) - significand(&tmp);
938 if ( (x < significand(&tmp)) ||
939 /* or equi-distant (from 0 & st(1)) and q is odd */
940 ((x == significand(&tmp)) && (q & 1) ) )
941 {
942 st0_sign = ! st0_sign;
943 significand(&tmp) = x;
944 q++;
945 }
946 }
947 }
948
949 if (q & 4) cc |= SW_C0;
950 if (q & 2) cc |= SW_C3;
951 if (q & 1) cc |= SW_C1;
952 }
953 else
954 {
955 control_word = old_cw;
956 setcc(0);
957 return;
958 }
959 }
960 else
961 {
962 /* There is a large exponent difference ( >= 64 ) */
963 /* To make much sense, the code in this section should
964 be done at high precision. */
965 int exp_1, N;
966 u_char sign;
967
968 /* prevent overflow here */
969 /* N is 'a number between 32 and 63' (p26-113) */
970 reg_copy(&st0, &tmp);
971 tmptag = st0_tag;
972 N = (expdif & 0x0000001f) + 32; /* This choice gives results
973 identical to an AMD 486 */
974 setexponent16(&tmp, N);
975 exp_1 = exponent16(&st1);
976 setexponent16(&st1, 0);
977 expdif -= N;
978
979 sign = getsign(&tmp) ^ st1_sign;
980 tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
981 sign);
982 setsign(&tmp, sign);
983
984 FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
985 overflow to 2^64 */
986
987 rem_kernel(significand(&st0),
988 &significand(&tmp),
989 significand(&st1),
990 significand(&tmp),
991 exponent(&tmp)
992 );
993 setexponent16(&tmp, exp_1 + expdif);
994
995 /* It is possible for the operation to be complete here.
996 What does the IEEE standard say? The Intel 80486 manual
997 implies that the operation will never be completed at this
998 point, and the behaviour of a real 80486 confirms this.
999 */
1000 if ( !(tmp.sigh | tmp.sigl) )
1001 {
1002 /* The result is zero */
1003 control_word = old_cw;
1004 partial_status = saved_status;
1005 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1006 setsign(&st0, st0_sign);
1007#ifdef PECULIAR_486
1008 setcc(SW_C2);
1009#else
1010 setcc(0);
1011#endif /* PECULIAR_486 */
1012 return;
1013 }
1014 cc = SW_C2;
1015 }
1016
1017 control_word = old_cw;
1018 partial_status = saved_status;
1019 tag = FPU_normalize_nuo(&tmp);
1020 reg_copy(&tmp, st0_ptr);
1021
1022 /* The only condition to be looked for is underflow,
1023 and it can occur here only if underflow is unmasked. */
1024 if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
1025 && !(control_word & CW_Underflow) )
1026 {
1027 setcc(cc);
1028 tag = arith_underflow(st0_ptr);
1029 setsign(st0_ptr, st0_sign);
1030 FPU_settag0(tag);
1031 return;
1032 }
1033 else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) )
1034 {
1035 stdexp(st0_ptr);
1036 setsign(st0_ptr, st0_sign);
1037 }
1038 else
1039 {
1040 tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
1041 }
1042 FPU_settag0(tag);
1043 setcc(cc);
1044
1045 return;
1046 }
1047
1048 if ( st0_tag == TAG_Special )
1049 st0_tag = FPU_Special(st0_ptr);
1050 if ( st1_tag == TAG_Special )
1051 st1_tag = FPU_Special(st1_ptr);
1052
1053 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1054 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1055 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1056 {
1057 if ( denormal_operand() < 0 )
1058 return;
1059 goto fprem_valid;
1060 }
1061 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1062 {
1063 FPU_stack_underflow();
1064 return;
1065 }
1066 else if ( st0_tag == TAG_Zero )
1067 {
1068 if ( st1_tag == TAG_Valid )
1069 {
1070 setcc(0); return;
1071 }
1072 else if ( st1_tag == TW_Denormal )
1073 {
1074 if ( denormal_operand() < 0 )
1075 return;
1076 setcc(0); return;
1077 }
1078 else if ( st1_tag == TAG_Zero )
1079 { arith_invalid(0); return; } /* fprem(?,0) always invalid */
1080 else if ( st1_tag == TW_Infinity )
1081 { setcc(0); return; }
1082 }
1083 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1084 {
1085 if ( st1_tag == TAG_Zero )
1086 {
1087 arith_invalid(0); /* fprem(Valid,Zero) is invalid */
1088 return;
1089 }
1090 else if ( st1_tag != TW_NaN )
1091 {
1092 if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal))
1093 && (denormal_operand() < 0) )
1094 return;
1095
1096 if ( st1_tag == TW_Infinity )
1097 {
1098 /* fprem(Valid,Infinity) is o.k. */
1099 setcc(0); return;
1100 }
1101 }
1102 }
1103 else if ( st0_tag == TW_Infinity )
1104 {
1105 if ( st1_tag != TW_NaN )
1106 {
1107 arith_invalid(0); /* fprem(Infinity,?) is invalid */
1108 return;
1109 }
1110 }
1111
1112 /* One of the registers must contain a NaN if we got here. */
1113
1114#ifdef PARANOID
1115 if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) )
1116 EXCEPTION(EX_INTERNAL | 0x118);
1117#endif /* PARANOID */
1118
1119 real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
1120
1121}
1122
1123
	 1124/* ST(1) <- ST(1) * log2 ST(0); pop ST */
1125static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
1126{
1127 FPU_REG *st1_ptr = &st(1), exponent;
1128 u_char st1_tag = FPU_gettagi(1);
1129 u_char sign;
1130 int e, tag;
1131
1132 clear_C1();
1133
1134 if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) )
1135 {
1136 both_valid:
1137 /* Both regs are Valid or Denormal */
1138 if ( signpositive(st0_ptr) )
1139 {
1140 if ( st0_tag == TW_Denormal )
1141 FPU_to_exp16(st0_ptr, st0_ptr);
1142 else
1143 /* Convert st(0) for internal use. */
1144 setexponent16(st0_ptr, exponent(st0_ptr));
1145
1146 if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) )
1147 {
1148 /* Special case. The result can be precise. */
1149 u_char esign;
1150 e = exponent16(st0_ptr);
1151 if ( e >= 0 )
1152 {
1153 exponent.sigh = e;
1154 esign = SIGN_POS;
1155 }
1156 else
1157 {
1158 exponent.sigh = -e;
1159 esign = SIGN_NEG;
1160 }
1161 exponent.sigl = 0;
1162 setexponent16(&exponent, 31);
1163 tag = FPU_normalize_nuo(&exponent);
1164 stdexp(&exponent);
1165 setsign(&exponent, esign);
1166 tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION);
1167 if ( tag >= 0 )
1168 FPU_settagi(1, tag);
1169 }
1170 else
1171 {
1172 /* The usual case */
1173 sign = getsign(st1_ptr);
1174 if ( st1_tag == TW_Denormal )
1175 FPU_to_exp16(st1_ptr, st1_ptr);
1176 else
1177 /* Convert st(1) for internal use. */
1178 setexponent16(st1_ptr, exponent(st1_ptr));
1179 poly_l2(st0_ptr, st1_ptr, sign);
1180 }
1181 }
1182 else
1183 {
1184 /* negative */
1185 if ( arith_invalid(1) < 0 )
1186 return;
1187 }
1188
1189 FPU_pop();
1190
1191 return;
1192 }
1193
1194 if ( st0_tag == TAG_Special )
1195 st0_tag = FPU_Special(st0_ptr);
1196 if ( st1_tag == TAG_Special )
1197 st1_tag = FPU_Special(st1_ptr);
1198
1199 if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1200 {
1201 FPU_stack_underflow_pop(1);
1202 return;
1203 }
1204 else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) )
1205 {
1206 if ( st0_tag == TAG_Zero )
1207 {
1208 if ( st1_tag == TAG_Zero )
1209 {
1210 /* Both args zero is invalid */
1211 if ( arith_invalid(1) < 0 )
1212 return;
1213 }
1214 else
1215 {
1216 u_char sign;
1217 sign = getsign(st1_ptr)^SIGN_NEG;
1218 if ( FPU_divide_by_zero(1, sign) < 0 )
1219 return;
1220
1221 setsign(st1_ptr, sign);
1222 }
1223 }
1224 else if ( st1_tag == TAG_Zero )
1225 {
1226 /* st(1) contains zero, st(0) valid <> 0 */
1227 /* Zero is the valid answer */
1228 sign = getsign(st1_ptr);
1229
1230 if ( signnegative(st0_ptr) )
1231 {
1232 /* log(negative) */
1233 if ( arith_invalid(1) < 0 )
1234 return;
1235 }
1236 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1237 return;
1238 else
1239 {
1240 if ( exponent(st0_ptr) < 0 )
1241 sign ^= SIGN_NEG;
1242
1243 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1244 setsign(st1_ptr, sign);
1245 }
1246 }
1247 else
1248 {
1249 /* One or both operands are denormals. */
1250 if ( denormal_operand() < 0 )
1251 return;
1252 goto both_valid;
1253 }
1254 }
1255 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1256 {
1257 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1258 return;
1259 }
1260 /* One or both arg must be an infinity */
1261 else if ( st0_tag == TW_Infinity )
1262 {
1263 if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) )
1264 {
1265 /* log(-infinity) or 0*log(infinity) */
1266 if ( arith_invalid(1) < 0 )
1267 return;
1268 }
1269 else
1270 {
1271 u_char sign = getsign(st1_ptr);
1272
1273 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1274 return;
1275
1276 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1277 setsign(st1_ptr, sign);
1278 }
1279 }
1280 /* st(1) must be infinity here */
1281 else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
1282 && ( signpositive(st0_ptr) ) )
1283 {
1284 if ( exponent(st0_ptr) >= 0 )
1285 {
1286 if ( (exponent(st0_ptr) == 0) &&
1287 (st0_ptr->sigh == 0x80000000) &&
1288 (st0_ptr->sigl == 0) )
1289 {
1290 /* st(0) holds 1.0 */
1291 /* infinity*log(1) */
1292 if ( arith_invalid(1) < 0 )
1293 return;
1294 }
1295 /* else st(0) is positive and > 1.0 */
1296 }
1297 else
1298 {
1299 /* st(0) is positive and < 1.0 */
1300
1301 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1302 return;
1303
1304 changesign(st1_ptr);
1305 }
1306 }
1307 else
1308 {
1309 /* st(0) must be zero or negative */
1310 if ( st0_tag == TAG_Zero )
1311 {
1312 /* This should be invalid, but a real 80486 is happy with it. */
1313
1314#ifndef PECULIAR_486
1315 sign = getsign(st1_ptr);
1316 if ( FPU_divide_by_zero(1, sign) < 0 )
1317 return;
1318#endif /* PECULIAR_486 */
1319
1320 changesign(st1_ptr);
1321 }
1322 else if ( arith_invalid(1) < 0 ) /* log(negative) */
1323 return;
1324 }
1325
1326 FPU_pop();
1327}
1328
1329
1330static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
1331{
1332 FPU_REG *st1_ptr = &st(1);
1333 u_char st1_tag = FPU_gettagi(1);
1334 int tag;
1335
1336 clear_C1();
1337 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1338 {
1339 valid_atan:
1340
1341 poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
1342
1343 FPU_pop();
1344
1345 return;
1346 }
1347
1348 if ( st0_tag == TAG_Special )
1349 st0_tag = FPU_Special(st0_ptr);
1350 if ( st1_tag == TAG_Special )
1351 st1_tag = FPU_Special(st1_ptr);
1352
1353 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1354 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1355 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1356 {
1357 if ( denormal_operand() < 0 )
1358 return;
1359
1360 goto valid_atan;
1361 }
1362 else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
1363 {
1364 FPU_stack_underflow_pop(1);
1365 return;
1366 }
1367 else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
1368 {
1369 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 )
1370 FPU_pop();
1371 return;
1372 }
1373 else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) )
1374 {
1375 u_char sign = getsign(st1_ptr);
1376 if ( st0_tag == TW_Infinity )
1377 {
1378 if ( st1_tag == TW_Infinity )
1379 {
1380 if ( signpositive(st0_ptr) )
1381 {
1382 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
1383 }
1384 else
1385 {
1386 setpositive(st1_ptr);
1387 tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr,
1388 FULL_PRECISION, SIGN_POS,
1389 exponent(&CONST_PI4), exponent(&CONST_PI2));
1390 if ( tag >= 0 )
1391 FPU_settagi(1, tag);
1392 }
1393 }
1394 else
1395 {
1396 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1397 return;
1398
1399 if ( signpositive(st0_ptr) )
1400 {
1401 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
1402 setsign(st1_ptr, sign); /* An 80486 preserves the sign */
1403 FPU_pop();
1404 return;
1405 }
1406 else
1407 {
1408 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1409 }
1410 }
1411 }
1412 else
1413 {
1414 /* st(1) is infinity, st(0) not infinity */
1415 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1416 return;
1417
1418 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1419 }
1420 setsign(st1_ptr, sign);
1421 }
1422 else if ( st1_tag == TAG_Zero )
1423 {
1424 /* st(0) must be valid or zero */
1425 u_char sign = getsign(st1_ptr);
1426
1427 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1428 return;
1429
1430 if ( signpositive(st0_ptr) )
1431 {
1432 /* An 80486 preserves the sign */
1433 FPU_pop();
1434 return;
1435 }
1436
1437 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
1438 setsign(st1_ptr, sign);
1439 }
1440 else if ( st0_tag == TAG_Zero )
1441 {
1442 /* st(1) must be TAG_Valid here */
1443 u_char sign = getsign(st1_ptr);
1444
1445 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1446 return;
1447
1448 FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
1449 setsign(st1_ptr, sign);
1450 }
1451#ifdef PARANOID
1452 else
1453 EXCEPTION(EX_INTERNAL | 0x125);
1454#endif /* PARANOID */
1455
1456 FPU_pop();
1457 set_precision_flag_up(); /* We do not really know if up or down */
1458}
1459
1460
1461static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
1462{
1463 do_fprem(st0_ptr, st0_tag, RC_CHOP);
1464}
1465
1466
1467static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
1468{
1469 do_fprem(st0_ptr, st0_tag, RC_RND);
1470}
1471
1472
1473static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
1474{
1475 u_char sign, sign1;
1476 FPU_REG *st1_ptr = &st(1), a, b;
1477 u_char st1_tag = FPU_gettagi(1);
1478
1479 clear_C1();
1480 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1481 {
1482 valid_yl2xp1:
1483
1484 sign = getsign(st0_ptr);
1485 sign1 = getsign(st1_ptr);
1486
1487 FPU_to_exp16(st0_ptr, &a);
1488 FPU_to_exp16(st1_ptr, &b);
1489
1490 if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) )
1491 return;
1492
1493 FPU_pop();
1494 return;
1495 }
1496
1497 if ( st0_tag == TAG_Special )
1498 st0_tag = FPU_Special(st0_ptr);
1499 if ( st1_tag == TAG_Special )
1500 st1_tag = FPU_Special(st1_ptr);
1501
1502 if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
1503 || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
1504 || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
1505 {
1506 if ( denormal_operand() < 0 )
1507 return;
1508
1509 goto valid_yl2xp1;
1510 }
1511 else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) )
1512 {
1513 FPU_stack_underflow_pop(1);
1514 return;
1515 }
1516 else if ( st0_tag == TAG_Zero )
1517 {
1518 switch ( st1_tag )
1519 {
1520 case TW_Denormal:
1521 if ( denormal_operand() < 0 )
1522 return;
1523
1524 case TAG_Zero:
1525 case TAG_Valid:
1526 setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
1527 FPU_copy_to_reg1(st0_ptr, st0_tag);
1528 break;
1529
1530 case TW_Infinity:
1531 /* Infinity*log(1) */
1532 if ( arith_invalid(1) < 0 )
1533 return;
1534 break;
1535
1536 case TW_NaN:
1537 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1538 return;
1539 break;
1540
1541 default:
1542#ifdef PARANOID
1543 EXCEPTION(EX_INTERNAL | 0x116);
1544 return;
1545#endif /* PARANOID */
1546 break;
1547 }
1548 }
1549 else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1550 {
1551 switch ( st1_tag )
1552 {
1553 case TAG_Zero:
1554 if ( signnegative(st0_ptr) )
1555 {
1556 if ( exponent(st0_ptr) >= 0 )
1557 {
1558 /* st(0) holds <= -1.0 */
1559#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1560 changesign(st1_ptr);
1561#else
1562 if ( arith_invalid(1) < 0 )
1563 return;
1564#endif /* PECULIAR_486 */
1565 }
1566 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1567 return;
1568 else
1569 changesign(st1_ptr);
1570 }
1571 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1572 return;
1573 break;
1574
1575 case TW_Infinity:
1576 if ( signnegative(st0_ptr) )
1577 {
1578 if ( (exponent(st0_ptr) >= 0) &&
1579 !((st0_ptr->sigh == 0x80000000) &&
1580 (st0_ptr->sigl == 0)) )
1581 {
1582 /* st(0) holds < -1.0 */
1583#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
1584 changesign(st1_ptr);
1585#else
1586 if ( arith_invalid(1) < 0 ) return;
1587#endif /* PECULIAR_486 */
1588 }
1589 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1590 return;
1591 else
1592 changesign(st1_ptr);
1593 }
1594 else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1595 return;
1596 break;
1597
1598 case TW_NaN:
1599 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1600 return;
1601 }
1602
1603 }
1604 else if ( st0_tag == TW_NaN )
1605 {
1606 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1607 return;
1608 }
1609 else if ( st0_tag == TW_Infinity )
1610 {
1611 if ( st1_tag == TW_NaN )
1612 {
1613 if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
1614 return;
1615 }
1616 else if ( signnegative(st0_ptr) )
1617 {
1618#ifndef PECULIAR_486
1619 /* This should have higher priority than denormals, but... */
1620 if ( arith_invalid(1) < 0 ) /* log(-infinity) */
1621 return;
1622#endif /* PECULIAR_486 */
1623 if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1624 return;
1625#ifdef PECULIAR_486
1626 /* Denormal operands actually get higher priority */
1627 if ( arith_invalid(1) < 0 ) /* log(-infinity) */
1628 return;
1629#endif /* PECULIAR_486 */
1630 }
1631 else if ( st1_tag == TAG_Zero )
1632 {
1633 /* log(infinity) */
1634 if ( arith_invalid(1) < 0 )
1635 return;
1636 }
1637
1638 /* st(1) must be valid here. */
1639
1640 else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
1641 return;
1642
1643 /* The Manual says that log(Infinity) is invalid, but a real
1644 80486 sensibly says that it is o.k. */
1645 else
1646 {
1647 u_char sign = getsign(st1_ptr);
1648 FPU_copy_to_reg1(&CONST_INF, TAG_Special);
1649 setsign(st1_ptr, sign);
1650 }
1651 }
1652#ifdef PARANOID
1653 else
1654 {
1655 EXCEPTION(EX_INTERNAL | 0x117);
1656 return;
1657 }
1658#endif /* PARANOID */
1659
1660 FPU_pop();
1661 return;
1662
1663}
1664
1665
1666static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
1667{
1668 FPU_REG *st1_ptr = &st(1);
1669 u_char st1_tag = FPU_gettagi(1);
1670 int old_cw = control_word;
1671 u_char sign = getsign(st0_ptr);
1672
1673 clear_C1();
1674 if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
1675 {
1676 long scale;
1677 FPU_REG tmp;
1678
1679 /* Convert register for internal use. */
1680 setexponent16(st0_ptr, exponent(st0_ptr));
1681
1682 valid_scale:
1683
1684 if ( exponent(st1_ptr) > 30 )
1685 {
1686 /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
1687
1688 if ( signpositive(st1_ptr) )
1689 {
1690 EXCEPTION(EX_Overflow);
1691 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1692 }
1693 else
1694 {
1695 EXCEPTION(EX_Underflow);
1696 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1697 }
1698 setsign(st0_ptr, sign);
1699 return;
1700 }
1701
1702 control_word &= ~CW_RC;
1703 control_word |= RC_CHOP;
1704 reg_copy(st1_ptr, &tmp);
1705 FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
1706 control_word = old_cw;
1707 scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
1708 scale += exponent16(st0_ptr);
1709
1710 setexponent16(st0_ptr, scale);
1711
1712 /* Use FPU_round() to properly detect under/overflow etc */
1713 FPU_round(st0_ptr, 0, 0, control_word, sign);
1714
1715 return;
1716 }
1717
1718 if ( st0_tag == TAG_Special )
1719 st0_tag = FPU_Special(st0_ptr);
1720 if ( st1_tag == TAG_Special )
1721 st1_tag = FPU_Special(st1_ptr);
1722
1723 if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
1724 {
1725 switch ( st1_tag )
1726 {
1727 case TAG_Valid:
1728 /* st(0) must be a denormal */
1729 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1730 return;
1731
1732 FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
1733 goto valid_scale;
1734
1735 case TAG_Zero:
1736 if ( st0_tag == TW_Denormal )
1737 denormal_operand();
1738 return;
1739
1740 case TW_Denormal:
1741 denormal_operand();
1742 return;
1743
1744 case TW_Infinity:
1745 if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
1746 return;
1747
1748 if ( signpositive(st1_ptr) )
1749 FPU_copy_to_reg0(&CONST_INF, TAG_Special);
1750 else
1751 FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
1752 setsign(st0_ptr, sign);
1753 return;
1754
1755 case TW_NaN:
1756 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1757 return;
1758 }
1759 }
1760 else if ( st0_tag == TAG_Zero )
1761 {
1762 switch ( st1_tag )
1763 {
1764 case TAG_Valid:
1765 case TAG_Zero:
1766 return;
1767
1768 case TW_Denormal:
1769 denormal_operand();
1770 return;
1771
1772 case TW_Infinity:
1773 if ( signpositive(st1_ptr) )
1774 arith_invalid(0); /* Zero scaled by +Infinity */
1775 return;
1776
1777 case TW_NaN:
1778 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1779 return;
1780 }
1781 }
1782 else if ( st0_tag == TW_Infinity )
1783 {
1784 switch ( st1_tag )
1785 {
1786 case TAG_Valid:
1787 case TAG_Zero:
1788 return;
1789
1790 case TW_Denormal:
1791 denormal_operand();
1792 return;
1793
1794 case TW_Infinity:
1795 if ( signnegative(st1_ptr) )
1796 arith_invalid(0); /* Infinity scaled by -Infinity */
1797 return;
1798
1799 case TW_NaN:
1800 real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
1801 return;
1802 }
1803 }
1804 else if ( st0_tag == TW_NaN )
1805 {
1806 if ( st1_tag != TAG_Empty )
1807 { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; }
1808 }
1809
1810#ifdef PARANOID
1811 if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) )
1812 {
1813 EXCEPTION(EX_INTERNAL | 0x115);
1814 return;
1815 }
1816#endif
1817
1818 /* At least one of st(0), st(1) must be empty */
1819 FPU_stack_underflow();
1820
1821}
1822
1823
1824/*---------------------------------------------------------------------------*/
1825
1826static FUNC_ST0 const trig_table_a[] = {
1827 f2xm1, fyl2x, fptan, fpatan,
1828 fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp
1829};
1830
1831void FPU_triga(void)
1832{
1833 (trig_table_a[FPU_rm])(&st(0), FPU_gettag0());
1834}
1835
1836
1837static FUNC_ST0 const trig_table_b[] =
1838 {
1839 fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos
1840 };
1841
1842void FPU_trigb(void)
1843{
1844 (trig_table_b[FPU_rm])(&st(0), FPU_gettag0());
1845}
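
For reference, the two 8-entry tables above dispatch on FPU_rm, the low three bits of the instruction's ModR/M byte, so each opcode in the group selects one handler. A minimal, self-contained C sketch of the same table-dispatch pattern; the names and handlers below are illustrative only and are not part of the kernel source:

#include <stdio.h>

typedef void (*op_fn)(int reg);            /* stand-in for FUNC_ST0 */

static void demo_f2xm1(int reg)  { printf("f2xm1  st(%d)\n", reg); }
static void demo_fyl2x(int reg)  { printf("fyl2x  st(%d)\n", reg); }
static void demo_fptan(int reg)  { printf("fptan  st(%d)\n", reg); }
static void demo_fpatan(int reg) { printf("fpatan st(%d)\n", reg); }

static op_fn const demo_table[8] = {
	demo_f2xm1, demo_fyl2x, demo_fptan, demo_fpatan,
	demo_f2xm1, demo_fyl2x, demo_fptan, demo_fpatan
};

static void demo_dispatch(unsigned char modrm)
{
	unsigned rm = modrm & 7;           /* low three bits pick the handler */
	demo_table[rm](0);
}

int main(void)
{
	demo_dispatch(0xf1);               /* rm == 1 -> demo_fyl2x */
	return 0;
}
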
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
new file mode 100644
index 000000000000..2e2c51a8bd3a
--- /dev/null
+++ b/arch/x86/math-emu/get_address.c
@@ -0,0 +1,438 @@
1/*---------------------------------------------------------------------------+
2 | get_address.c |
3 | |
4 | Get the effective address from an FPU instruction. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20
21#include <linux/stddef.h>
22
23#include <asm/uaccess.h>
24#include <asm/desc.h>
25
26#include "fpu_system.h"
27#include "exception.h"
28#include "fpu_emu.h"
29
30
31#define FPU_WRITE_BIT 0x10
32
33static int reg_offset[] = {
34 offsetof(struct info,___eax),
35 offsetof(struct info,___ecx),
36 offsetof(struct info,___edx),
37 offsetof(struct info,___ebx),
38 offsetof(struct info,___esp),
39 offsetof(struct info,___ebp),
40 offsetof(struct info,___esi),
41 offsetof(struct info,___edi)
42};
43
44#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
45
46static int reg_offset_vm86[] = {
47 offsetof(struct info,___cs),
48 offsetof(struct info,___vm86_ds),
49 offsetof(struct info,___vm86_es),
50 offsetof(struct info,___vm86_fs),
51 offsetof(struct info,___vm86_gs),
52 offsetof(struct info,___ss),
53 offsetof(struct info,___vm86_ds)
54 };
55
56#define VM86_REG_(x) (*(unsigned short *) \
57 (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
58
59/* ___GS is a dummy; gs itself is not saved on the stack. */
60#define ___GS ___ds
61
62static int reg_offset_pm[] = {
63 offsetof(struct info,___cs),
64 offsetof(struct info,___ds),
65 offsetof(struct info,___es),
66 offsetof(struct info,___fs),
67 offsetof(struct info,___GS),
68 offsetof(struct info,___ss),
69 offsetof(struct info,___ds)
70 };
71
72#define PM_REG_(x) (*(unsigned short *) \
73 (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
74
75
76/* Decode the SIB byte. This function assumes mod != 0 */
77static int sib(int mod, unsigned long *fpu_eip)
78{
79 u_char ss,index,base;
80 long offset;
81
82 RE_ENTRANT_CHECK_OFF;
83 FPU_code_access_ok(1);
84 FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */
85 RE_ENTRANT_CHECK_ON;
86 (*fpu_eip)++;
87 ss = base >> 6;
88 index = (base >> 3) & 7;
89 base &= 7;
90
91 if ((mod == 0) && (base == 5))
92 offset = 0; /* No base register */
93 else
94 offset = REG_(base);
95
96 if (index == 4)
97 {
98 /* No index register */
99 /* A non-zero ss is illegal */
100 if ( ss )
101 EXCEPTION(EX_Invalid);
102 }
103 else
104 {
105 offset += (REG_(index)) << ss;
106 }
107
108 if (mod == 1)
109 {
110 /* 8 bit signed displacement */
111 long displacement;
112 RE_ENTRANT_CHECK_OFF;
113 FPU_code_access_ok(1);
114 FPU_get_user(displacement, (signed char __user *) (*fpu_eip));
115 offset += displacement;
116 RE_ENTRANT_CHECK_ON;
117 (*fpu_eip)++;
118 }
119 else if (mod == 2 || base == 5) /* The second condition also has mod==0 */
120 {
121 /* 32 bit displacement */
122 long displacement;
123 RE_ENTRANT_CHECK_OFF;
124 FPU_code_access_ok(4);
125 FPU_get_user(displacement, (long __user *) (*fpu_eip));
126 offset += displacement;
127 RE_ENTRANT_CHECK_ON;
128 (*fpu_eip) += 4;
129 }
130
131 return offset;
132}
133
134
135static unsigned long vm86_segment(u_char segment,
136 struct address *addr)
137{
138 segment--;
139#ifdef PARANOID
140 if ( segment > PREFIX_SS_ )
141 {
142 EXCEPTION(EX_INTERNAL|0x130);
143 math_abort(FPU_info,SIGSEGV);
144 }
145#endif /* PARANOID */
146 addr->selector = VM86_REG_(segment);
147 return (unsigned long)VM86_REG_(segment) << 4;
148}
149
150
151/* This should work for 16 and 32 bit protected mode. */
152static long pm_address(u_char FPU_modrm, u_char segment,
153 struct address *addr, long offset)
154{
155 struct desc_struct descriptor;
156 unsigned long base_address, limit, address, seg_top;
157
158 segment--;
159
160#ifdef PARANOID
161 /* segment is unsigned, so this also detects if segment was 0: */
162 if ( segment > PREFIX_SS_ )
163 {
164 EXCEPTION(EX_INTERNAL|0x132);
165 math_abort(FPU_info,SIGSEGV);
166 }
167#endif /* PARANOID */
168
169 switch ( segment )
170 {
171 /* gs isn't used by the kernel, so it still has its
172 user-space value. */
173 case PREFIX_GS_-1:
174 /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
175 savesegment(gs, addr->selector);
176 break;
177 default:
178 addr->selector = PM_REG_(segment);
179 }
180
181 descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
182 base_address = SEG_BASE_ADDR(descriptor);
183 address = base_address + offset;
184 limit = base_address
185 + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1;
186 if ( limit < base_address ) limit = 0xffffffff;
187
188 if ( SEG_EXPAND_DOWN(descriptor) )
189 {
190 if ( SEG_G_BIT(descriptor) )
191 seg_top = 0xffffffff;
192 else
193 {
194 seg_top = base_address + (1 << 20);
195 if ( seg_top < base_address ) seg_top = 0xffffffff;
196 }
197 access_limit =
198 (address <= limit) || (address >= seg_top) ? 0 :
199 ((seg_top-address) >= 255 ? 255 : seg_top-address);
200 }
201 else
202 {
203 access_limit =
204 (address > limit) || (address < base_address) ? 0 :
205 ((limit-address) >= 254 ? 255 : limit-address+1);
206 }
207 if ( SEG_EXECUTE_ONLY(descriptor) ||
208 (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) )
209 {
210 access_limit = 0;
211 }
212 return address;
213}
214
215
216/*
217 MOD R/M byte: MOD == 3 has a special use for the FPU
218 SIB byte used iff R/M = 100b
219
220 7 6 5 4 3 2 1 0
221 ..... ......... .........
222 MOD OPCODE(2) R/M
223
224
225 SIB byte
226
227 7 6 5 4 3 2 1 0
228 ..... ......... .........
229 SS INDEX BASE
230
231*/
232
233void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
234 struct address *addr,
235 fpu_addr_modes addr_modes)
236{
237 u_char mod;
238 unsigned rm = FPU_modrm & 7;
239 long *cpu_reg_ptr;
240 int address = 0; /* Initialized just to stop compiler warnings. */
241
242 /* Memory accessed via the cs selector is write protected
243 in `non-segmented' 32 bit protected mode. */
244 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
245 && (addr_modes.override.segment == PREFIX_CS_) )
246 {
247 math_abort(FPU_info,SIGSEGV);
248 }
249
250 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
251
252 mod = (FPU_modrm >> 6) & 3;
253
254 if (rm == 4 && mod != 3)
255 {
256 address = sib(mod, fpu_eip);
257 }
258 else
259 {
260 cpu_reg_ptr = & REG_(rm);
261 switch (mod)
262 {
263 case 0:
264 if (rm == 5)
265 {
266 /* Special case: disp32 */
267 RE_ENTRANT_CHECK_OFF;
268 FPU_code_access_ok(4);
269 FPU_get_user(address, (unsigned long __user *) (*fpu_eip));
270 (*fpu_eip) += 4;
271 RE_ENTRANT_CHECK_ON;
272 addr->offset = address;
273 return (void __user *) address;
274 }
275 else
276 {
277 address = *cpu_reg_ptr; /* Just return the contents
278 of the cpu register */
279 addr->offset = address;
280 return (void __user *) address;
281 }
282 case 1:
283 /* 8 bit signed displacement */
284 RE_ENTRANT_CHECK_OFF;
285 FPU_code_access_ok(1);
286 FPU_get_user(address, (signed char __user *) (*fpu_eip));
287 RE_ENTRANT_CHECK_ON;
288 (*fpu_eip)++;
289 break;
290 case 2:
291 /* 32 bit displacement */
292 RE_ENTRANT_CHECK_OFF;
293 FPU_code_access_ok(4);
294 FPU_get_user(address, (long __user *) (*fpu_eip));
295 (*fpu_eip) += 4;
296 RE_ENTRANT_CHECK_ON;
297 break;
298 case 3:
299 /* Not legal for the FPU */
300 EXCEPTION(EX_Invalid);
301 }
302 address += *cpu_reg_ptr;
303 }
304
305 addr->offset = address;
306
307 switch ( addr_modes.default_mode )
308 {
309 case 0:
310 break;
311 case VM86:
312 address += vm86_segment(addr_modes.override.segment, addr);
313 break;
314 case PM16:
315 case SEG32:
316 address = pm_address(FPU_modrm, addr_modes.override.segment,
317 addr, address);
318 break;
319 default:
320 EXCEPTION(EX_INTERNAL|0x133);
321 }
322
323 return (void __user *)address;
324}
325
326
327void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
328 struct address *addr,
329 fpu_addr_modes addr_modes)
330{
331 u_char mod;
332 unsigned rm = FPU_modrm & 7;
333 int address = 0; /* Default used for mod == 0 */
334
335 /* Memory accessed via the cs selector is write protected
336 in `non-segmented' 32 bit protected mode. */
337 if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
338 && (addr_modes.override.segment == PREFIX_CS_) )
339 {
340 math_abort(FPU_info,SIGSEGV);
341 }
342
343 addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
344
345 mod = (FPU_modrm >> 6) & 3;
346
347 switch (mod)
348 {
349 case 0:
350 if (rm == 6)
351 {
352 /* Special case: disp16 */
353 RE_ENTRANT_CHECK_OFF;
354 FPU_code_access_ok(2);
355 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
356 (*fpu_eip) += 2;
357 RE_ENTRANT_CHECK_ON;
358 goto add_segment;
359 }
360 break;
361 case 1:
362 /* 8 bit signed displacement */
363 RE_ENTRANT_CHECK_OFF;
364 FPU_code_access_ok(1);
365 FPU_get_user(address, (signed char __user *) (*fpu_eip));
366 RE_ENTRANT_CHECK_ON;
367 (*fpu_eip)++;
368 break;
369 case 2:
370 /* 16 bit displacement */
371 RE_ENTRANT_CHECK_OFF;
372 FPU_code_access_ok(2);
373 FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
374 (*fpu_eip) += 2;
375 RE_ENTRANT_CHECK_ON;
376 break;
377 case 3:
378 /* Not legal for the FPU */
379 EXCEPTION(EX_Invalid);
380 break;
381 }
382 switch ( rm )
383 {
384 case 0:
385 address += FPU_info->___ebx + FPU_info->___esi;
386 break;
387 case 1:
388 address += FPU_info->___ebx + FPU_info->___edi;
389 break;
390 case 2:
391 address += FPU_info->___ebp + FPU_info->___esi;
392 if ( addr_modes.override.segment == PREFIX_DEFAULT )
393 addr_modes.override.segment = PREFIX_SS_;
394 break;
395 case 3:
396 address += FPU_info->___ebp + FPU_info->___edi;
397 if ( addr_modes.override.segment == PREFIX_DEFAULT )
398 addr_modes.override.segment = PREFIX_SS_;
399 break;
400 case 4:
401 address += FPU_info->___esi;
402 break;
403 case 5:
404 address += FPU_info->___edi;
405 break;
406 case 6:
407 address += FPU_info->___ebp;
408 if ( addr_modes.override.segment == PREFIX_DEFAULT )
409 addr_modes.override.segment = PREFIX_SS_;
410 break;
411 case 7:
412 address += FPU_info->___ebx;
413 break;
414 }
415
416 add_segment:
417 address &= 0xffff;
418
419 addr->offset = address;
420
421 switch ( addr_modes.default_mode )
422 {
423 case 0:
424 break;
425 case VM86:
426 address += vm86_segment(addr_modes.override.segment, addr);
427 break;
428 case PM16:
429 case SEG32:
430 address = pm_address(FPU_modrm, addr_modes.override.segment,
431 addr, address);
432 break;
433 default:
434 EXCEPTION(EX_INTERNAL|0x131);
435 }
436
437 return (void __user *)address ;
438}
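
FPU_get_address() above decodes the ModR/M byte (mod/opcode/rm) and, when rm == 100b, a following SIB byte (ss/index/base), exactly as laid out in the comment diagram. A small standalone illustration of that field split; the example byte values are arbitrary:

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x94;        /* mod=10, opcode=010, rm=100 (SIB follows) */
	unsigned char sib   = 0x88;        /* ss=10, index=001, base=000 */

	unsigned mod    = (modrm >> 6) & 3;
	unsigned opcode = (modrm >> 3) & 7;
	unsigned rm     =  modrm       & 7;

	unsigned ss    = (sib >> 6) & 3;
	unsigned index = (sib >> 3) & 7;
	unsigned base  =  sib       & 7;

	printf("mod=%u opcode=%u rm=%u\n", mod, opcode, rm);
	printf("ss=%u index=%u base=%u (scale=%u)\n", ss, index, base, 1u << ss);
	return 0;
}
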
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
new file mode 100644
index 000000000000..eebd6fb1c8a8
--- /dev/null
+++ b/arch/x86/math-emu/load_store.c
@@ -0,0 +1,272 @@
1/*---------------------------------------------------------------------------+
2 | load_store.c |
3 | |
4 | This file contains most of the code to interpret the FPU instructions |
5 | which load and store from user memory. |
6 | |
7 | Copyright (C) 1992,1993,1994,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Note: |
16 | The file contains code which accesses user memory. |
17 | Emulator static data may change when user memory is accessed, due to |
18 | other processes using the emulator while swapping is in progress. |
19 +---------------------------------------------------------------------------*/
20
21#include <asm/uaccess.h>
22
23#include "fpu_system.h"
24#include "exception.h"
25#include "fpu_emu.h"
26#include "status_w.h"
27#include "control_w.h"
28
29
30#define _NONE_ 0 /* st0_ptr etc not needed */
31#define _REG0_ 1 /* Will be storing st(0) */
32#define _PUSH_ 3 /* Need to check for space to push onto stack */
33#define _null_ 4 /* Function illegal or not implemented */
34
35#define pop_0() { FPU_settag0(TAG_Empty); top++; }
36
37
38static u_char const type_table[32] = {
39 _PUSH_, _PUSH_, _PUSH_, _PUSH_,
40 _null_, _null_, _null_, _null_,
41 _REG0_, _REG0_, _REG0_, _REG0_,
42 _REG0_, _REG0_, _REG0_, _REG0_,
43 _NONE_, _null_, _NONE_, _PUSH_,
44 _NONE_, _PUSH_, _null_, _PUSH_,
45 _NONE_, _null_, _NONE_, _REG0_,
46 _NONE_, _REG0_, _NONE_, _REG0_
47 };
48
49u_char const data_sizes_16[32] = {
50 4, 4, 8, 2, 0, 0, 0, 0,
51 4, 4, 8, 2, 4, 4, 8, 2,
52 14, 0, 94, 10, 2, 10, 0, 8,
53 14, 0, 94, 10, 2, 10, 2, 8
54};
55
56static u_char const data_sizes_32[32] = {
57 4, 4, 8, 2, 0, 0, 0, 0,
58 4, 4, 8, 2, 4, 4, 8, 2,
59 28, 0,108, 10, 2, 10, 0, 8,
60 28, 0,108, 10, 2, 10, 2, 8
61};
62
63int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
64 void __user *data_address)
65{
66 FPU_REG loaded_data;
67 FPU_REG *st0_ptr;
68 u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
69 u_char loaded_tag;
70
71 st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
72
73 if ( addr_modes.default_mode & PROTECTED )
74 {
75 if ( addr_modes.default_mode == SEG32 )
76 {
77 if ( access_limit < data_sizes_32[type] )
78 math_abort(FPU_info,SIGSEGV);
79 }
80 else if ( addr_modes.default_mode == PM16 )
81 {
82 if ( access_limit < data_sizes_16[type] )
83 math_abort(FPU_info,SIGSEGV);
84 }
85#ifdef PARANOID
86 else
87 EXCEPTION(EX_INTERNAL|0x140);
88#endif /* PARANOID */
89 }
90
91 switch ( type_table[type] )
92 {
93 case _NONE_:
94 break;
95 case _REG0_:
96 st0_ptr = &st(0); /* Some of these instructions pop after
97 storing */
98 st0_tag = FPU_gettag0();
99 break;
100 case _PUSH_:
101 {
102 if ( FPU_gettagi(-1) != TAG_Empty )
103 { FPU_stack_overflow(); return 0; }
104 top--;
105 st0_ptr = &st(0);
106 }
107 break;
108 case _null_:
109 FPU_illegal();
110 return 0;
111#ifdef PARANOID
112 default:
113 EXCEPTION(EX_INTERNAL|0x141);
114 return 0;
115#endif /* PARANOID */
116 }
117
118 switch ( type )
119 {
120 case 000: /* fld m32real */
121 clear_C1();
122 loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data);
123 if ( (loaded_tag == TAG_Special)
124 && isNaN(&loaded_data)
125 && (real_1op_NaN(&loaded_data) < 0) )
126 {
127 top++;
128 break;
129 }
130 FPU_copy_to_reg0(&loaded_data, loaded_tag);
131 break;
132 case 001: /* fild m32int */
133 clear_C1();
134 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
135 FPU_copy_to_reg0(&loaded_data, loaded_tag);
136 break;
137 case 002: /* fld m64real */
138 clear_C1();
139 loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data);
140 if ( (loaded_tag == TAG_Special)
141 && isNaN(&loaded_data)
142 && (real_1op_NaN(&loaded_data) < 0) )
143 {
144 top++;
145 break;
146 }
147 FPU_copy_to_reg0(&loaded_data, loaded_tag);
148 break;
149 case 003: /* fild m16int */
150 clear_C1();
151 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
152 FPU_copy_to_reg0(&loaded_data, loaded_tag);
153 break;
154 case 010: /* fst m32real */
155 clear_C1();
156 FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address);
157 break;
158 case 011: /* fist m32int */
159 clear_C1();
160 FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
161 break;
162 case 012: /* fst m64real */
163 clear_C1();
164 FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address);
165 break;
166 case 013: /* fist m16int */
167 clear_C1();
168 FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
169 break;
170 case 014: /* fstp m32real */
171 clear_C1();
172 if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) )
173 pop_0(); /* pop only if the number was actually stored
174 (see the 80486 manual p16-28) */
175 break;
176 case 015: /* fistp m32int */
177 clear_C1();
178 if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) )
179 pop_0(); /* pop only if the number was actually stored
180 (see the 80486 manual p16-28) */
181 break;
182 case 016: /* fstp m64real */
183 clear_C1();
184 if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) )
185 pop_0(); /* pop only if the number was actually stored
186 (see the 80486 manual p16-28) */
187 break;
188 case 017: /* fistp m16int */
189 clear_C1();
190 if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) )
191 pop_0(); /* pop only if the number was actually stored
192 (see the 80486 manual p16-28) */
193 break;
194 case 020: /* fldenv m14/28byte */
195 fldenv(addr_modes, (u_char __user *)data_address);
196 /* Ensure that the values just loaded are not changed by
197 fix-up operations. */
198 return 1;
199 case 022: /* frstor m94/108byte */
200 frstor(addr_modes, (u_char __user *)data_address);
201 /* Ensure that the values just loaded are not changed by
202 fix-up operations. */
203 return 1;
204 case 023: /* fbld m80dec */
205 clear_C1();
206 loaded_tag = FPU_load_bcd((u_char __user *)data_address);
207 FPU_settag0(loaded_tag);
208 break;
209 case 024: /* fldcw */
210 RE_ENTRANT_CHECK_OFF;
211 FPU_access_ok(VERIFY_READ, data_address, 2);
212 FPU_get_user(control_word, (unsigned short __user *) data_address);
213 RE_ENTRANT_CHECK_ON;
214 if ( partial_status & ~control_word & CW_Exceptions )
215 partial_status |= (SW_Summary | SW_Backward);
216 else
217 partial_status &= ~(SW_Summary | SW_Backward);
218#ifdef PECULIAR_486
219 control_word |= 0x40; /* An 80486 appears to always set this bit */
220#endif /* PECULIAR_486 */
221 return 1;
222 case 025: /* fld m80real */
223 clear_C1();
224 loaded_tag = FPU_load_extended((long double __user *)data_address, 0);
225 FPU_settag0(loaded_tag);
226 break;
227 case 027: /* fild m64int */
228 clear_C1();
229 loaded_tag = FPU_load_int64((long long __user *)data_address);
230 if (loaded_tag == TAG_Error)
231 return 0;
232 FPU_settag0(loaded_tag);
233 break;
234 case 030: /* fstenv m14/28byte */
235 fstenv(addr_modes, (u_char __user *)data_address);
236 return 1;
237 case 032: /* fsave */
238 fsave(addr_modes, (u_char __user *)data_address);
239 return 1;
240 case 033: /* fbstp m80dec */
241 clear_C1();
242 if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) )
243 pop_0(); /* pop only if the number was actually stored
244 (see the 80486 manual p16-28) */
245 break;
246 case 034: /* fstcw m16int */
247 RE_ENTRANT_CHECK_OFF;
248 FPU_access_ok(VERIFY_WRITE,data_address,2);
249 FPU_put_user(control_word, (unsigned short __user *) data_address);
250 RE_ENTRANT_CHECK_ON;
251 return 1;
252 case 035: /* fstp m80real */
253 clear_C1();
254 if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) )
255 pop_0(); /* pop only if the number was actually stored
256 (see the 80486 manual p16-28) */
257 break;
258 case 036: /* fstsw m2byte */
259 RE_ENTRANT_CHECK_OFF;
260 FPU_access_ok(VERIFY_WRITE,data_address,2);
261 FPU_put_user(status_word(),(unsigned short __user *) data_address);
262 RE_ENTRANT_CHECK_ON;
263 return 1;
264 case 037: /* fistp m64int */
265 clear_C1();
266 if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) )
267 pop_0(); /* pop only if the number was actually stored
268 (see the 80486 manual p16-28) */
269 break;
270 }
271 return 0;
272}
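
FPU_load_store() above is table-driven: data_sizes_16/data_sizes_32 give the memory operand size for each of the 32 instruction types, and in protected mode that size is checked against access_limit before any user-memory access. A simplified sketch of that size check, reusing the 32-bit size table; the helper names are illustrative only:

#include <stdio.h>

static const unsigned char demo_sizes_32[32] = {
	 4, 4,  8,  2, 0,  0, 0, 8,
	 4, 4,  8,  2, 4,  4, 8, 2,
	28, 0, 108, 10, 2, 10, 0, 8,
	28, 0, 108, 10, 2, 10, 2, 8
};

/* Returns non-zero if the operand fits inside the remaining segment space. */
static int operand_fits(unsigned type, unsigned access_limit)
{
	return access_limit >= demo_sizes_32[type & 31];
}

int main(void)
{
	printf("fld m64real, 8 bytes left:   %s\n", operand_fits(002, 8)  ? "ok" : "fault");
	printf("frstor (108 bytes), 64 left: %s\n", operand_fits(022, 64) ? "ok" : "fault");
	return 0;
}

Note: the table above copies data_sizes_32 from the file for illustration except where noted; only the check logic is paraphrased.
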
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
new file mode 100644
index 000000000000..717785a53eb4
--- /dev/null
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -0,0 +1,176 @@
1/*---------------------------------------------------------------------------+
2 | mul_Xsig.S |
3 | |
4 | Multiply a 12 byte fixed point number by another fixed point number. |
5 | |
6 | Copyright (C) 1992,1994,1995 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Call from C as: |
11 | void mul32_Xsig(Xsig *x, unsigned b) |
12 | |
13 | void mul64_Xsig(Xsig *x, unsigned long long *b) |
14 | |
15 | void mul_Xsig_Xsig(Xsig *x, unsigned *b) |
16 | |
17 | The result is neither rounded nor normalized, and the ls bit or so may |
18 | be wrong. |
19 | |
20 +---------------------------------------------------------------------------*/
21 .file "mul_Xsig.S"
22
23
24#include "fpu_emu.h"
25
26.text
27ENTRY(mul32_Xsig)
28 pushl %ebp
29 movl %esp,%ebp
30 subl $16,%esp
31 pushl %esi
32
33 movl PARAM1,%esi
34 movl PARAM2,%ecx
35
36 xor %eax,%eax
37 movl %eax,-4(%ebp)
38 movl %eax,-8(%ebp)
39
40 movl (%esi),%eax /* lsl of Xsig */
41 mull %ecx /* msl of b */
42 movl %edx,-12(%ebp)
43
44 movl 4(%esi),%eax /* midl of Xsig */
45 mull %ecx /* msl of b */
46 addl %eax,-12(%ebp)
47 adcl %edx,-8(%ebp)
48 adcl $0,-4(%ebp)
49
50 movl 8(%esi),%eax /* msl of Xsig */
51 mull %ecx /* msl of b */
52 addl %eax,-8(%ebp)
53 adcl %edx,-4(%ebp)
54
55 movl -12(%ebp),%eax
56 movl %eax,(%esi)
57 movl -8(%ebp),%eax
58 movl %eax,4(%esi)
59 movl -4(%ebp),%eax
60 movl %eax,8(%esi)
61
62 popl %esi
63 leave
64 ret
65
66
67ENTRY(mul64_Xsig)
68 pushl %ebp
69 movl %esp,%ebp
70 subl $16,%esp
71 pushl %esi
72
73 movl PARAM1,%esi
74 movl PARAM2,%ecx
75
76 xor %eax,%eax
77 movl %eax,-4(%ebp)
78 movl %eax,-8(%ebp)
79
80 movl (%esi),%eax /* lsl of Xsig */
81 mull 4(%ecx) /* msl of b */
82 movl %edx,-12(%ebp)
83
84 movl 4(%esi),%eax /* midl of Xsig */
85 mull (%ecx) /* lsl of b */
86 addl %edx,-12(%ebp)
87 adcl $0,-8(%ebp)
88 adcl $0,-4(%ebp)
89
90 movl 4(%esi),%eax /* midl of Xsig */
91 mull 4(%ecx) /* msl of b */
92 addl %eax,-12(%ebp)
93 adcl %edx,-8(%ebp)
94 adcl $0,-4(%ebp)
95
96 movl 8(%esi),%eax /* msl of Xsig */
97 mull (%ecx) /* lsl of b */
98 addl %eax,-12(%ebp)
99 adcl %edx,-8(%ebp)
100 adcl $0,-4(%ebp)
101
102 movl 8(%esi),%eax /* msl of Xsig */
103 mull 4(%ecx) /* msl of b */
104 addl %eax,-8(%ebp)
105 adcl %edx,-4(%ebp)
106
107 movl -12(%ebp),%eax
108 movl %eax,(%esi)
109 movl -8(%ebp),%eax
110 movl %eax,4(%esi)
111 movl -4(%ebp),%eax
112 movl %eax,8(%esi)
113
114 popl %esi
115 leave
116 ret
117
118
119
120ENTRY(mul_Xsig_Xsig)
121 pushl %ebp
122 movl %esp,%ebp
123 subl $16,%esp
124 pushl %esi
125
126 movl PARAM1,%esi
127 movl PARAM2,%ecx
128
129 xor %eax,%eax
130 movl %eax,-4(%ebp)
131 movl %eax,-8(%ebp)
132
133 movl (%esi),%eax /* lsl of Xsig */
134 mull 8(%ecx) /* msl of b */
135 movl %edx,-12(%ebp)
136
137 movl 4(%esi),%eax /* midl of Xsig */
138 mull 4(%ecx) /* midl of b */
139 addl %edx,-12(%ebp)
140 adcl $0,-8(%ebp)
141 adcl $0,-4(%ebp)
142
143 movl 8(%esi),%eax /* msl of Xsig */
144 mull (%ecx) /* lsl of b */
145 addl %edx,-12(%ebp)
146 adcl $0,-8(%ebp)
147 adcl $0,-4(%ebp)
148
149 movl 4(%esi),%eax /* midl of Xsig */
150 mull 8(%ecx) /* msl of b */
151 addl %eax,-12(%ebp)
152 adcl %edx,-8(%ebp)
153 adcl $0,-4(%ebp)
154
155 movl 8(%esi),%eax /* msl of Xsig */
156 mull 4(%ecx) /* midl of b */
157 addl %eax,-12(%ebp)
158 adcl %edx,-8(%ebp)
159 adcl $0,-4(%ebp)
160
161 movl 8(%esi),%eax /* msl of Xsig */
162 mull 8(%ecx) /* msl of b */
163 addl %eax,-8(%ebp)
164 adcl %edx,-4(%ebp)
165
166 movl -12(%ebp),%edx
167 movl %edx,(%esi)
168 movl -8(%ebp),%edx
169 movl %edx,4(%esi)
170 movl -4(%ebp),%edx
171 movl %edx,8(%esi)
172
173 popl %esi
174 leave
175 ret
176
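
mul32_Xsig above multiplies the 96-bit fixed-point significand (three 32-bit words) by a 32-bit value and keeps the top 96 bits of the 128-bit product. A portable C sketch of the same computation, which may help when following the register usage in the assembly; the struct and function names are illustrative only:

#include <stdio.h>
#include <stdint.h>

struct demo_xsig { uint32_t lsw, midw, msw; };

static void demo_mul32_xsig(struct demo_xsig *x, uint32_t b)
{
	uint64_t p0 = (uint64_t)x->lsw  * b;    /* contributes only a carry */
	uint64_t p1 = (uint64_t)x->midw * b;
	uint64_t p2 = (uint64_t)x->msw  * b;

	uint64_t lo  = (p0 >> 32) + (uint32_t)p1;
	uint64_t mid = (lo >> 32) + (p1 >> 32) + (uint32_t)p2;
	uint64_t hi  = (mid >> 32) + (p2 >> 32);

	x->lsw  = (uint32_t)lo;
	x->midw = (uint32_t)mid;
	x->msw  = (uint32_t)hi;
}

int main(void)
{
	struct demo_xsig x = { 0, 0, 0x80000000u };   /* 0.5 in fixed point */
	demo_mul32_xsig(&x, 0xc0000000u);             /* multiply by 0.75 */
	printf("%08x %08x %08x\n", x.msw, x.midw, x.lsw);  /* expect 60000000 00000000 00000000 */
	return 0;
}
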
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
new file mode 100644
index 000000000000..4db798114923
--- /dev/null
+++ b/arch/x86/math-emu/poly.h
@@ -0,0 +1,121 @@
1/*---------------------------------------------------------------------------+
2 | poly.h |
3 | |
4 | Header file for the FPU-emu poly*.c source files. |
5 | |
6 | Copyright (C) 1994,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | Declarations and definitions for functions operating on Xsig (12-byte |
11 | extended-significand) quantities. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15#ifndef _POLY_H
16#define _POLY_H
17
18/* This 12-byte structure is used to improve the accuracy of computation
19 of transcendental functions.
20 Intended to be used to get results better than 8-byte computation
21 allows. 9-byte would probably be sufficient.
22 */
23typedef struct {
24 unsigned long lsw;
25 unsigned long midw;
26 unsigned long msw;
27} Xsig;
28
29asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
30 unsigned long long *result);
31asmlinkage void polynomial_Xsig(Xsig *, const unsigned long long *x,
32 const unsigned long long terms[], const int n);
33
34asmlinkage void mul32_Xsig(Xsig *, const unsigned long mult);
35asmlinkage void mul64_Xsig(Xsig *, const unsigned long long *mult);
36asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult);
37
38asmlinkage void shr_Xsig(Xsig *, const int n);
39asmlinkage int round_Xsig(Xsig *);
40asmlinkage int norm_Xsig(Xsig *);
41asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
42
43/* Macro to extract the most significant 32 bits from a long long */
44#define LL_MSW(x) (((unsigned long *)&x)[1])
45
46/* Macro to initialize an Xsig struct */
47#define MK_XSIG(a,b,c) { c, b, a }
48
49/* Macro to access the 8 ms bytes of an Xsig as a long long */
50#define XSIG_LL(x) (*(unsigned long long *)&x.midw)
51
52
53/*
54 Need to run gcc with optimizations on to get these to
55 actually be in-line.
56 */
57
58/* Multiply two fixed-point 32 bit numbers, producing a 32 bit result.
59 The answer is the ms word of the product. */
60/* Some versions of gcc make it difficult to stop eax from being clobbered.
61 Merely specifying that it is used doesn't work...
62 */
63static inline unsigned long mul_32_32(const unsigned long arg1,
64 const unsigned long arg2)
65{
66 int retval;
67 asm volatile ("mull %2; movl %%edx,%%eax" \
68 :"=a" (retval) \
69 :"0" (arg1), "g" (arg2) \
70 :"dx");
71 return retval;
72}
73
74
75/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
76static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
77{
78 asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
79 "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
80 "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
81 "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n"
82 :"=g" (*dest):"g" (dest), "g" (x2)
83 :"ax","si","di");
84}
85
86
87/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
88/* Note: the constraints in the asm statement didn't always work properly
89 with gcc 2.5.8. Changing from using edi to using ecx got around the
90 problem, but keep fingers crossed! */
91static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
92{
93 asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
94 "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
95 "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
96 "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
97 "jnc 0f;\n"
98 "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
99 "movl %4,%%ecx; incl (%%ecx)\n"
100 "movl $1,%%eax; jmp 1f;\n"
101 "0: xorl %%eax,%%eax;\n"
102 "1:\n"
103 :"=g" (*exp), "=g" (*dest)
104 :"g" (dest), "g" (x2), "g" (exp)
105 :"cx","si","ax");
106}
107
108
109/* Negate (subtract from 1.0) the 12 byte Xsig */
110/* This is faster in a loop on my 386 than using the "neg" instruction. */
111static inline void negate_Xsig(Xsig *x)
112{
113 asm volatile("movl %1,%%esi;\n"
114 "xorl %%ecx,%%ecx;\n"
115 "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
116 "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
117 "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n"
118 :"=g" (*x):"g" (x):"si","ax","cx");
119}
120
121#endif /* _POLY_H */
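
The Xsig structure above is a 96-bit fixed-point significand with the binary point at the left of msw, i.e. its value is (msw*2^64 + midw*2^32 + lsw) / 2^96, with any exponent carried separately. A small sketch that converts such a value to a double for inspection; the conversion and names are illustrative only, and the emulator itself never uses floating-point arithmetic:

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint32_t lsw, midw, msw;           /* same layout as Xsig, fixed 32-bit words */
} demo_xsig;

static double demo_xsig_value(const demo_xsig *x)
{
	const double w = 4294967296.0;     /* 2^32 */
	return x->msw / w + x->midw / (w * w) + x->lsw / (w * w * w);
}

int main(void)
{
	demo_xsig half    = { 0, 0, 0x80000000u };   /* 0.1000...b = 0.5  */
	demo_xsig quarter = { 0, 0, 0x40000000u };   /* 0.0100...b = 0.25 */

	printf("%f %f\n", demo_xsig_value(&half), demo_xsig_value(&quarter));
	return 0;
}
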
diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c
new file mode 100644
index 000000000000..9766ad5e9743
--- /dev/null
+++ b/arch/x86/math-emu/poly_2xm1.c
@@ -0,0 +1,156 @@
1/*---------------------------------------------------------------------------+
2 | poly_2xm1.c |
3 | |
4 | Function to compute 2^x-1 by a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "control_w.h"
18#include "poly.h"
19
20
21#define HIPOWER 11
22static const unsigned long long lterms[HIPOWER] =
23{
24 0x0000000000000000LL, /* This term done separately as 12 bytes */
25 0xf5fdeffc162c7543LL,
26 0x1c6b08d704a0bfa6LL,
27 0x0276556df749cc21LL,
28 0x002bb0ffcf14f6b8LL,
29 0x0002861225ef751cLL,
30 0x00001ffcbfcd5422LL,
31 0x00000162c005d5f1LL,
32 0x0000000da96ccb1bLL,
33 0x0000000078d1b897LL,
34 0x000000000422b029LL
35};
36
37static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
38
39/* Four slices: 0.0 : 0.25 : 0.50 : 0.75 : 1.0,
40 These numbers are 2^(1/4), 2^(1/2), and 2^(3/4)
41 */
42static const Xsig shiftterm0 = MK_XSIG(0, 0, 0);
43static const Xsig shiftterm1 = MK_XSIG(0x9837f051, 0x8db8a96f, 0x46ad2318);
44static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
45static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);
46
47static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
48 &shiftterm2, &shiftterm3 };
49
50
51/*--- poly_2xm1() -----------------------------------------------------------+
52 | Requires st(0) which is TAG_Valid and < 1. |
53 +---------------------------------------------------------------------------*/
54int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
55{
56 long int exponent, shift;
57 unsigned long long Xll;
58 Xsig accumulator, Denom, argSignif;
59 u_char tag;
60
61 exponent = exponent16(arg);
62
63#ifdef PARANOID
64 if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */
65 {
66 /* Number negative, too large, or not Valid. */
67 EXCEPTION(EX_INTERNAL|0x127);
68 return 1;
69 }
70#endif /* PARANOID */
71
72 argSignif.lsw = 0;
73 XSIG_LL(argSignif) = Xll = significand(arg);
74
75 if ( exponent == -1 )
76 {
77 shift = (argSignif.msw & 0x40000000) ? 3 : 2;
78 /* subtract 0.5 or 0.75 */
79 exponent -= 2;
80 XSIG_LL(argSignif) <<= 2;
81 Xll <<= 2;
82 }
83 else if ( exponent == -2 )
84 {
85 shift = 1;
86 /* subtract 0.25 */
87 exponent--;
88 XSIG_LL(argSignif) <<= 1;
89 Xll <<= 1;
90 }
91 else
92 shift = 0;
93
94 if ( exponent < -2 )
95 {
96 /* Shift the argument right by the required places. */
97 if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U )
98 Xll++; /* round up */
99 }
100
101 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
102 polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1);
103 mul_Xsig_Xsig(&accumulator, &argSignif);
104 shr_Xsig(&accumulator, 3);
105
106 mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */
107 add_two_Xsig(&accumulator, &argSignif, &exponent);
108
109 if ( shift )
110 {
111 /* The argument is large, use the identity:
112 f(x+a) = f(a) * (f(x) + 1) - 1;
113 */
114 shr_Xsig(&accumulator, - exponent);
115 accumulator.msw |= 0x80000000; /* add 1.0 */
116 mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
117 accumulator.msw &= 0x3fffffff; /* subtract 1.0 */
118 exponent = 1;
119 }
120
121 if ( sign != SIGN_POS )
122 {
123 /* The argument is negative, use the identity:
124 f(-x) = -f(x) / (1 + f(x))
125 */
126 Denom.lsw = accumulator.lsw;
127 XSIG_LL(Denom) = XSIG_LL(accumulator);
128 if ( exponent < 0 )
129 shr_Xsig(&Denom, - exponent);
130 else if ( exponent > 0 )
131 {
132 /* exponent must be 1 here */
133 XSIG_LL(Denom) <<= 1;
134 if ( Denom.lsw & 0x80000000 )
135 XSIG_LL(Denom) |= 1;
136 (Denom.lsw) <<= 1;
137 }
138 Denom.msw |= 0x80000000; /* add 1.0 */
139 div_Xsig(&accumulator, &Denom, &accumulator);
140 }
141
142 /* Convert to 64 bit signed-compatible */
143 exponent += round_Xsig(&accumulator);
144
145 result = &st(0);
146 significand(result) = XSIG_LL(accumulator);
147 setexponent16(result, exponent);
148
149 tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
150
151 setsign(result, sign);
152 FPU_settag0(tag);
153
154 return 0;
155
156}
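
poly_2xm1() above relies on two identities for f(x) = 2^x - 1: an argument shift by a = 1/4, 1/2 or 3/4 via f(x + a) = 2^a * (f(x) + 1) - 1 (the shiftterm multipliers are 2^a), and f(-x) = -f(x) / (f(x) + 1) for negative arguments. A standalone double-precision check of both identities, purely for illustration; the emulator does all of this in 96-bit fixed point:

#include <stdio.h>
#include <math.h>

static double f(double x) { return exp2(x) - 1.0; }   /* f(x) = 2^x - 1 */

int main(void)
{
	double x = 0.30, a = 0.25;

	printf("f(x+a) direct   : %.15f\n", f(x + a));
	printf("f(x+a) via shift: %.15f\n", exp2(a) * (f(x) + 1.0) - 1.0);

	printf("f(-x)  direct   : %.15f\n", f(-x));
	printf("f(-x)  via ident: %.15f\n", -f(x) / (f(x) + 1.0));
	return 0;
}
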
diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c
new file mode 100644
index 000000000000..82f702952f69
--- /dev/null
+++ b/arch/x86/math-emu/poly_atan.c
@@ -0,0 +1,229 @@
1/*---------------------------------------------------------------------------+
2 | poly_atan.c |
3 | |
4 | Compute the arctan of a FPU_REG, using a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "status_w.h"
18#include "control_w.h"
19#include "poly.h"
20
21
22#define HIPOWERon 6 /* odd poly, negative terms */
23static const unsigned long long oddnegterms[HIPOWERon] =
24{
25 0x0000000000000000LL, /* Dummy (not for - 1.0) */
26 0x015328437f756467LL,
27 0x0005dda27b73dec6LL,
28 0x0000226bf2bfb91aLL,
29 0x000000ccc439c5f7LL,
30 0x0000000355438407LL
31} ;
32
33#define HIPOWERop 6 /* odd poly, positive terms */
34static const unsigned long long oddplterms[HIPOWERop] =
35{
36/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */
37 0x0db55a71875c9ac2LL,
38 0x0029fce2d67880b0LL,
39 0x0000dfd3908b4596LL,
40 0x00000550fd61dab4LL,
41 0x0000001c9422b3f9LL,
42 0x000000003e3301e1LL
43};
44
45static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
46
47static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
48
49static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
50
51
52/*--- poly_atan() -----------------------------------------------------------+
53 | |
54 +---------------------------------------------------------------------------*/
55void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
56 FPU_REG *st1_ptr, u_char st1_tag)
57{
58 u_char transformed, inverted,
59 sign1, sign2;
60 int exponent;
61 long int dummy_exp;
62 Xsig accumulator, Numer, Denom, accumulatore, argSignif,
63 argSq, argSqSq;
64 u_char tag;
65
66 sign1 = getsign(st0_ptr);
67 sign2 = getsign(st1_ptr);
68 if ( st0_tag == TAG_Valid )
69 {
70 exponent = exponent(st0_ptr);
71 }
72 else
73 {
74 /* This gives non-compatible stack contents... */
75 FPU_to_exp16(st0_ptr, st0_ptr);
76 exponent = exponent16(st0_ptr);
77 }
78 if ( st1_tag == TAG_Valid )
79 {
80 exponent -= exponent(st1_ptr);
81 }
82 else
83 {
84 /* This gives non-compatible stack contents... */
85 FPU_to_exp16(st1_ptr, st1_ptr);
86 exponent -= exponent16(st1_ptr);
87 }
88
89 if ( (exponent < 0) || ((exponent == 0) &&
90 ((st0_ptr->sigh < st1_ptr->sigh) ||
91 ((st0_ptr->sigh == st1_ptr->sigh) &&
92 (st0_ptr->sigl < st1_ptr->sigl))) ) )
93 {
94 inverted = 1;
95 Numer.lsw = Denom.lsw = 0;
96 XSIG_LL(Numer) = significand(st0_ptr);
97 XSIG_LL(Denom) = significand(st1_ptr);
98 }
99 else
100 {
101 inverted = 0;
102 exponent = -exponent;
103 Numer.lsw = Denom.lsw = 0;
104 XSIG_LL(Numer) = significand(st1_ptr);
105 XSIG_LL(Denom) = significand(st0_ptr);
106 }
107 div_Xsig(&Numer, &Denom, &argSignif);
108 exponent += norm_Xsig(&argSignif);
109
110 if ( (exponent >= -1)
111 || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) )
112 {
113 /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
114 /* Convert the argument by an identity for atan */
115 transformed = 1;
116
117 if ( exponent >= 0 )
118 {
119#ifdef PARANOID
120 if ( !( (exponent == 0) &&
121 (argSignif.lsw == 0) && (argSignif.midw == 0) &&
122 (argSignif.msw == 0x80000000) ) )
123 {
124 EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */
125 return;
126 }
127#endif /* PARANOID */
128 argSignif.msw = 0; /* Make the transformed arg -> 0.0 */
129 }
130 else
131 {
132 Numer.lsw = Denom.lsw = argSignif.lsw;
133 XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
134
135 if ( exponent < -1 )
136 shr_Xsig(&Numer, -1-exponent);
137 negate_Xsig(&Numer);
138
139 shr_Xsig(&Denom, -exponent);
140 Denom.msw |= 0x80000000;
141
142 div_Xsig(&Numer, &Denom, &argSignif);
143
144 exponent = -1 + norm_Xsig(&argSignif);
145 }
146 }
147 else
148 {
149 transformed = 0;
150 }
151
152 argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw;
153 argSq.msw = argSignif.msw;
154 mul_Xsig_Xsig(&argSq, &argSq);
155
156 argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw;
157 mul_Xsig_Xsig(&argSqSq, &argSqSq);
158
159 accumulatore.lsw = argSq.lsw;
160 XSIG_LL(accumulatore) = XSIG_LL(argSq);
161
162 shr_Xsig(&argSq, 2*(-1-exponent-1));
163 shr_Xsig(&argSqSq, 4*(-1-exponent-1));
164
165 /* Now have argSq etc with binary point at the left
166 .1xxxxxxxx */
167
168 /* Do the basic fixed point polynomial evaluation */
169 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
170 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
171 oddplterms, HIPOWERop-1);
172 mul64_Xsig(&accumulator, &XSIG_LL(argSq));
173 negate_Xsig(&accumulator);
174 polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1);
175 negate_Xsig(&accumulator);
176 add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
177
178 mul64_Xsig(&accumulatore, &denomterm);
179 shr_Xsig(&accumulatore, 1 + 2*(-1-exponent));
180 accumulatore.msw |= 0x80000000;
181
182 div_Xsig(&accumulator, &accumulatore, &accumulator);
183
184 mul_Xsig_Xsig(&accumulator, &argSignif);
185 mul_Xsig_Xsig(&accumulator, &argSq);
186
187 shr_Xsig(&accumulator, 3);
188 negate_Xsig(&accumulator);
189 add_Xsig_Xsig(&accumulator, &argSignif);
190
191 if ( transformed )
192 {
193 /* compute pi/4 - accumulator */
194 shr_Xsig(&accumulator, -1-exponent);
195 negate_Xsig(&accumulator);
196 add_Xsig_Xsig(&accumulator, &pi_signif);
197 exponent = -1;
198 }
199
200 if ( inverted )
201 {
202 /* compute pi/2 - accumulator */
203 shr_Xsig(&accumulator, -exponent);
204 negate_Xsig(&accumulator);
205 add_Xsig_Xsig(&accumulator, &pi_signif);
206 exponent = 0;
207 }
208
209 if ( sign1 )
210 {
211 /* compute pi - accumulator */
212 shr_Xsig(&accumulator, 1 - exponent);
213 negate_Xsig(&accumulator);
214 add_Xsig_Xsig(&accumulator, &pi_signif);
215 exponent = 1;
216 }
217
218 exponent += round_Xsig(&accumulator);
219
220 significand(st1_ptr) = XSIG_LL(accumulator);
221 setexponent16(st1_ptr, exponent);
222
223 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
224 FPU_settagi(1, tag);
225
226 set_precision_flag_up(); /* We do not really know if up or down,
227 use this as the default. */
228
229}
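
poly_atan() above reduces its argument in two stages: the "inverted" branch works on the reciprocal ratio and later corrects with atan(r) = pi/2 - atan(1/r), and the "transformed" branch applies atan(t) = pi/4 - atan((1 - t)/(1 + t)) once t exceeds sqrt(2) - 1 (a further pi correction handles negative st(0)). A double-precision check of both identities, illustrative only since the emulator works in 96-bit fixed point:

#include <stdio.h>
#include <math.h>

int main(void)
{
	const double pi = acos(-1.0);
	double t = 0.9, r = 2.5;

	/* transformed branch: atan(t) = pi/4 - atan((1 - t)/(1 + t)) */
	printf("%.15f\n", atan(t));
	printf("%.15f\n", pi/4 - atan((1.0 - t)/(1.0 + t)));

	/* inverted branch: atan(r) = pi/2 - atan(1/r), r > 0 */
	printf("%.15f\n", atan(r));
	printf("%.15f\n", pi/2 - atan(1.0/r));
	return 0;
}
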
diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c
new file mode 100644
index 000000000000..dd00e1d5b074
--- /dev/null
+++ b/arch/x86/math-emu/poly_l2.c
@@ -0,0 +1,272 @@
1/*---------------------------------------------------------------------------+
2 | poly_l2.c |
3 | |
4 | Compute the base 2 log of a FPU_REG, using a polynomial approximation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13
14#include "exception.h"
15#include "reg_constant.h"
16#include "fpu_emu.h"
17#include "fpu_system.h"
18#include "control_w.h"
19#include "poly.h"
20
21
22static void log2_kernel(FPU_REG const *arg, u_char argsign,
23 Xsig *accum_result, long int *expon);
24
25
26/*--- poly_l2() -------------------------------------------------------------+
27 | Base 2 logarithm by a polynomial approximation. |
28 +---------------------------------------------------------------------------*/
29void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
30{
31 long int exponent, expon, expon_expon;
32 Xsig accumulator, expon_accum, yaccum;
33 u_char sign, argsign;
34 FPU_REG x;
35 int tag;
36
37 exponent = exponent16(st0_ptr);
38
39 /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
40 if ( st0_ptr->sigh > (unsigned)0xb504f334 )
41 {
42 /* Treat as sqrt(2)/2 < st0_ptr < 1 */
43 significand(&x) = - significand(st0_ptr);
44 setexponent16(&x, -1);
45 exponent++;
46 argsign = SIGN_NEG;
47 }
48 else
49 {
50 /* Treat as 1 <= st0_ptr < sqrt(2) */
51 x.sigh = st0_ptr->sigh - 0x80000000;
52 x.sigl = st0_ptr->sigl;
53 setexponent16(&x, 0);
54 argsign = SIGN_POS;
55 }
56 tag = FPU_normalize_nuo(&x);
57
58 if ( tag == TAG_Zero )
59 {
60 expon = 0;
61 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
62 }
63 else
64 {
65 log2_kernel(&x, argsign, &accumulator, &expon);
66 }
67
68 if ( exponent < 0 )
69 {
70 sign = SIGN_NEG;
71 exponent = -exponent;
72 }
73 else
74 sign = SIGN_POS;
75 expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0;
76 if ( exponent )
77 {
78 expon_expon = 31 + norm_Xsig(&expon_accum);
79 shr_Xsig(&accumulator, expon_expon - expon);
80
81 if ( sign ^ argsign )
82 negate_Xsig(&accumulator);
83 add_Xsig_Xsig(&accumulator, &expon_accum);
84 }
85 else
86 {
87 expon_expon = expon;
88 sign = argsign;
89 }
90
91 yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr);
92 mul_Xsig_Xsig(&accumulator, &yaccum);
93
94 expon_expon += round_Xsig(&accumulator);
95
96 if ( accumulator.msw == 0 )
97 {
98 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
99 return;
100 }
101
102 significand(st1_ptr) = XSIG_LL(accumulator);
103 setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
104
105 tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
106 FPU_settagi(1, tag);
107
108 set_precision_flag_up(); /* 80486 appears to always do this */
109
110 return;
111
112}
113
114
115/*--- poly_l2p1() -----------------------------------------------------------+
116 | Base 2 logarithm by a polynomial approximation. |
117 | log2(x+1) |
118 +---------------------------------------------------------------------------*/
119int poly_l2p1(u_char sign0, u_char sign1,
120 FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest)
121{
122 u_char tag;
123 long int exponent;
124 Xsig accumulator, yaccum;
125
126 if ( exponent16(st0_ptr) < 0 )
127 {
128 log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
129
130 yaccum.lsw = 0;
131 XSIG_LL(yaccum) = significand(st1_ptr);
132 mul_Xsig_Xsig(&accumulator, &yaccum);
133
134 exponent += round_Xsig(&accumulator);
135
136 exponent += exponent16(st1_ptr) + 1;
137 if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER;
138
139 significand(dest) = XSIG_LL(accumulator);
140 setexponent16(dest, exponent);
141
142 tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
143 FPU_settagi(1, tag);
144
145 if ( tag == TAG_Valid )
146 set_precision_flag_up(); /* 80486 appears to always do this */
147 }
148 else
149 {
150 /* The magnitude of st0_ptr is far too large. */
151
152 if ( sign0 != SIGN_POS )
153 {
154 /* Trying to get the log of a negative number. */
155#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
156 changesign(st1_ptr);
157#else
158 if ( arith_invalid(1) < 0 )
159 return 1;
160#endif /* PECULIAR_486 */
161 }
162
163 /* 80486 appears to do this */
164 if ( sign0 == SIGN_NEG )
165 set_precision_flag_down();
166 else
167 set_precision_flag_up();
168 }
169
170 if ( exponent(dest) <= EXP_UNDER )
171 EXCEPTION(EX_Underflow);
172
173 return 0;
174
175}
176
177
178
179
180#undef HIPOWER
181#define HIPOWER 10
182static const unsigned long long logterms[HIPOWER] =
183{
184 0x2a8eca5705fc2ef0LL,
185 0xf6384ee1d01febceLL,
186 0x093bb62877cdf642LL,
187 0x006985d8a9ec439bLL,
188 0x0005212c4f55a9c8LL,
189 0x00004326a16927f0LL,
190 0x0000038d1d80a0e7LL,
191 0x0000003141cc80c6LL,
192 0x00000002b1668c9fLL,
193 0x000000002c7a46aaLL
194};
195
196static const unsigned long leadterm = 0xb8000000;
197
198
199/*--- log2_kernel() ---------------------------------------------------------+
200 | Base 2 logarithm by a polynomial approximation. |
201 | log2(x+1) |
202 +---------------------------------------------------------------------------*/
203static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
204 long int *expon)
205{
206 long int exponent, adj;
207 unsigned long long Xsq;
208 Xsig accumulator, Numer, Denom, argSignif, arg_signif;
209
210 exponent = exponent16(arg);
211 Numer.lsw = Denom.lsw = 0;
212 XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
213 if ( argsign == SIGN_POS )
214 {
215 shr_Xsig(&Denom, 2 - (1 + exponent));
216 Denom.msw |= 0x80000000;
217 div_Xsig(&Numer, &Denom, &argSignif);
218 }
219 else
220 {
221 shr_Xsig(&Denom, 1 - (1 + exponent));
222 negate_Xsig(&Denom);
223 if ( Denom.msw & 0x80000000 )
224 {
225 div_Xsig(&Numer, &Denom, &argSignif);
226 exponent ++;
227 }
228 else
229 {
230 /* Denom must be 1.0 */
231 argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw;
232 argSignif.msw = Numer.msw;
233 }
234 }
235
236#ifndef PECULIAR_486
237 /* Should check here that |local_arg| is within the valid range */
238 if ( exponent >= -2 )
239 {
240 if ( (exponent > -2) ||
241 (argSignif.msw > (unsigned)0xafb0ccc0) )
242 {
243 /* The argument is too large */
244 }
245 }
246#endif /* PECULIAR_486 */
247
248 arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif);
249 adj = norm_Xsig(&argSignif);
250 accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif);
251 mul_Xsig_Xsig(&accumulator, &accumulator);
252 shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj)));
253 Xsq = XSIG_LL(accumulator);
254 if ( accumulator.lsw & 0x80000000 )
255 Xsq++;
256
257 accumulator.msw = accumulator.midw = accumulator.lsw = 0;
258 /* Do the basic fixed point polynomial evaluation */
259 polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1);
260
261 mul_Xsig_Xsig(&accumulator, &argSignif);
262 shr_Xsig(&accumulator, 6 - adj);
263
264 mul32_Xsig(&arg_signif, leadterm);
265 add_two_Xsig(&accumulator, &arg_signif, &exponent);
266
267 *expon = exponent + 1;
268 accum_result->lsw = accumulator.lsw;
269 accum_result->midw = accumulator.midw;
270 accum_result->msw = accumulator.msw;
271
272}
diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c
new file mode 100644
index 000000000000..a36313fb06f1
--- /dev/null
+++ b/arch/x86/math-emu/poly_sin.c
@@ -0,0 +1,397 @@
1/*---------------------------------------------------------------------------+
2 | poly_sin.c |
3 | |
4 | Computation of an approximation of the sin function and the cosine |
5 | function by a polynomial. |
6 | |
7 | Copyright (C) 1992,1993,1994,1997,1999 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@melbpc.org.au |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14
15#include "exception.h"
16#include "reg_constant.h"
17#include "fpu_emu.h"
18#include "fpu_system.h"
19#include "control_w.h"
20#include "poly.h"
21
22
23#define N_COEFF_P 4
24#define N_COEFF_N 4
25
26static const unsigned long long pos_terms_l[N_COEFF_P] =
27{
28 0xaaaaaaaaaaaaaaabLL,
29 0x00d00d00d00cf906LL,
30 0x000006b99159a8bbLL,
31 0x000000000d7392e6LL
32};
33
34static const unsigned long long neg_terms_l[N_COEFF_N] =
35{
36 0x2222222222222167LL,
37 0x0002e3bc74aab624LL,
38 0x0000000b09229062LL,
39 0x00000000000c7973LL
40};
41
42
43
44#define N_COEFF_PH 4
45#define N_COEFF_NH 4
46static const unsigned long long pos_terms_h[N_COEFF_PH] =
47{
48 0x0000000000000000LL,
49 0x05b05b05b05b0406LL,
50 0x000049f93edd91a9LL,
51 0x00000000c9c9ed62LL
52};
53
54static const unsigned long long neg_terms_h[N_COEFF_NH] =
55{
56 0xaaaaaaaaaaaaaa98LL,
57 0x001a01a01a019064LL,
58 0x0000008f76c68a77LL,
59 0x0000000000d58f5eLL
60};
61
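/*
 * Illustrative sketch, not part of the original emulator: the tables
 * above hold (suitably scaled) magnitudes of the Taylor coefficients,
 * split into positive and negative groups so that the fixed-point code
 * below only ever adds unsigned quantities and applies the sign with a
 * single negate_Xsig().  The same splitting in plain floating point
 * (a hypothetical reference, truncated to the same number of terms):
 */
#if 0
static double sin_ref(double x)
{
	double x2 = x * x, x4 = x2 * x2;
	/* sin(x)/x = 1 - x^2/3! + x^4/5! - x^6/7! + x^8/9! */
	double pos = 1.0 + x4 * (1.0 / 120.0 + x4 / 362880.0);
	double neg = x2 * (1.0 / 6.0 + x4 / 5040.0);

	return x * (pos - neg);
}
#endif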
62
63/*--- poly_sine() -----------------------------------------------------------+
64 | |
65 +---------------------------------------------------------------------------*/
66void poly_sine(FPU_REG *st0_ptr)
67{
68 int exponent, echange;
69 Xsig accumulator, argSqrd, argTo4;
70 unsigned long fix_up, adj;
71 unsigned long long fixed_arg;
72 FPU_REG result;
73
74 exponent = exponent(st0_ptr);
75
76 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
77
 78 /* Split into two ranges, for smaller and larger arguments */
 79 /* The boundary between the two ranges is approx 0.88309101259 */
80 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) )
81 {
82 /* The argument is <= 0.88309101259 */
83
84 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0;
85 mul64_Xsig(&argSqrd, &significand(st0_ptr));
86 shr_Xsig(&argSqrd, 2*(-1-exponent));
87 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
88 argTo4.lsw = argSqrd.lsw;
89 mul_Xsig_Xsig(&argTo4, &argTo4);
90
91 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
92 N_COEFF_N-1);
93 mul_Xsig_Xsig(&accumulator, &argSqrd);
94 negate_Xsig(&accumulator);
95
96 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
97 N_COEFF_P-1);
98
99 shr_Xsig(&accumulator, 2); /* Divide by four */
100 accumulator.msw |= 0x80000000; /* Add 1.0 */
101
102 mul64_Xsig(&accumulator, &significand(st0_ptr));
103 mul64_Xsig(&accumulator, &significand(st0_ptr));
104 mul64_Xsig(&accumulator, &significand(st0_ptr));
105
106 /* Divide by four, FPU_REG compatible, etc */
107 exponent = 3*exponent;
108
109 /* The minimum exponent difference is 3 */
110 shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
111
112 negate_Xsig(&accumulator);
113 XSIG_LL(accumulator) += significand(st0_ptr);
114
115 echange = round_Xsig(&accumulator);
116
117 setexponentpos(&result, exponent(st0_ptr) + echange);
118 }
119 else
120 {
121 /* The argument is > 0.88309101259 */
122 /* We use sin(st(0)) = cos(pi/2-st(0)) */
123
124 fixed_arg = significand(st0_ptr);
125
126 if ( exponent == 0 )
127 {
128 /* The argument is >= 1.0 */
129
130 /* Put the binary point at the left. */
131 fixed_arg <<= 1;
132 }
133 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
134 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
 135 /* A special case arises here due to rounding; fix it up. */
136 if ( fixed_arg == 0xffffffffffffffffLL )
137 fixed_arg = 0;
138
139 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
140 mul64_Xsig(&argSqrd, &fixed_arg);
141
142 XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw;
143 mul_Xsig_Xsig(&argTo4, &argTo4);
144
145 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
146 N_COEFF_NH-1);
147 mul_Xsig_Xsig(&accumulator, &argSqrd);
148 negate_Xsig(&accumulator);
149
150 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
151 N_COEFF_PH-1);
152 negate_Xsig(&accumulator);
153
154 mul64_Xsig(&accumulator, &fixed_arg);
155 mul64_Xsig(&accumulator, &fixed_arg);
156
157 shr_Xsig(&accumulator, 3);
158 negate_Xsig(&accumulator);
159
160 add_Xsig_Xsig(&accumulator, &argSqrd);
161
162 shr_Xsig(&accumulator, 1);
163
164 accumulator.lsw |= 1; /* A zero accumulator here would cause problems */
165 negate_Xsig(&accumulator);
166
167 /* The basic computation is complete. Now fix the answer to
168 compensate for the error due to the approximation used for
169 pi/2
170 */
171
172 /* This has an exponent of -65 */
173 fix_up = 0x898cc517;
174 /* The fix-up needs to be improved for larger args */
175 if ( argSqrd.msw & 0xffc00000 )
176 {
177 /* Get about 32 bit precision in these: */
178 fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
179 }
180 fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
181
182 adj = accumulator.lsw; /* temp save */
183 accumulator.lsw -= fix_up;
184 if ( accumulator.lsw > adj )
185 XSIG_LL(accumulator) --;
186
187 echange = round_Xsig(&accumulator);
188
189 setexponentpos(&result, echange - 1);
190 }
191
192 significand(&result) = XSIG_LL(accumulator);
193 setsign(&result, getsign(st0_ptr));
194 FPU_copy_to_reg0(&result, TAG_Valid);
195
196#ifdef PARANOID
197 if ( (exponent(&result) >= 0)
198 && (significand(&result) > 0x8000000000000000LL) )
199 {
200 EXCEPTION(EX_INTERNAL|0x150);
201 }
202#endif /* PARANOID */
203
204}
205
206
207
208/*--- poly_cos() ------------------------------------------------------------+
209 | |
210 +---------------------------------------------------------------------------*/
211void poly_cos(FPU_REG *st0_ptr)
212{
213 FPU_REG result;
214 long int exponent, exp2, echange;
215 Xsig accumulator, argSqrd, fix_up, argTo4;
216 unsigned long long fixed_arg;
217
218#ifdef PARANOID
219 if ( (exponent(st0_ptr) > 0)
220 || ((exponent(st0_ptr) == 0)
221 && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) )
222 {
223 EXCEPTION(EX_Invalid);
224 FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
225 return;
226 }
227#endif /* PARANOID */
228
229 exponent = exponent(st0_ptr);
230
231 accumulator.lsw = accumulator.midw = accumulator.msw = 0;
232
233 if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) )
234 {
235 /* arg is < 0.687705 */
236
237 argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl;
238 argSqrd.lsw = 0;
239 mul64_Xsig(&argSqrd, &significand(st0_ptr));
240
241 if ( exponent < -1 )
242 {
243 /* shift the argument right by the required places */
244 shr_Xsig(&argSqrd, 2*(-1-exponent));
245 }
246
247 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
248 argTo4.lsw = argSqrd.lsw;
249 mul_Xsig_Xsig(&argTo4, &argTo4);
250
251 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
252 N_COEFF_NH-1);
253 mul_Xsig_Xsig(&accumulator, &argSqrd);
254 negate_Xsig(&accumulator);
255
256 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
257 N_COEFF_PH-1);
258 negate_Xsig(&accumulator);
259
260 mul64_Xsig(&accumulator, &significand(st0_ptr));
261 mul64_Xsig(&accumulator, &significand(st0_ptr));
262 shr_Xsig(&accumulator, -2*(1+exponent));
263
264 shr_Xsig(&accumulator, 3);
265 negate_Xsig(&accumulator);
266
267 add_Xsig_Xsig(&accumulator, &argSqrd);
268
269 shr_Xsig(&accumulator, 1);
270
271 /* It doesn't matter if accumulator is all zero here, the
272 following code will work ok */
273 negate_Xsig(&accumulator);
274
275 if ( accumulator.lsw & 0x80000000 )
276 XSIG_LL(accumulator) ++;
277 if ( accumulator.msw == 0 )
278 {
279 /* The result is 1.0 */
280 FPU_copy_to_reg0(&CONST_1, TAG_Valid);
281 return;
282 }
283 else
284 {
285 significand(&result) = XSIG_LL(accumulator);
286
 287 /* will be a valid positive number with exponent = -1 */
288 setexponentpos(&result, -1);
289 }
290 }
291 else
292 {
293 fixed_arg = significand(st0_ptr);
294
295 if ( exponent == 0 )
296 {
297 /* The argument is >= 1.0 */
298
299 /* Put the binary point at the left. */
300 fixed_arg <<= 1;
301 }
302 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
303 fixed_arg = 0x921fb54442d18469LL - fixed_arg;
 304 /* A special case arises here due to rounding; fix it up. */
305 if ( fixed_arg == 0xffffffffffffffffLL )
306 fixed_arg = 0;
307
308 exponent = -1;
309 exp2 = -1;
310
311 /* A shift is needed here only for a narrow range of arguments,
312 i.e. for fixed_arg approx 2^-32, but we pick up more... */
313 if ( !(LL_MSW(fixed_arg) & 0xffff0000) )
314 {
315 fixed_arg <<= 16;
316 exponent -= 16;
317 exp2 -= 16;
318 }
319
320 XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
321 mul64_Xsig(&argSqrd, &fixed_arg);
322
323 if ( exponent < -1 )
324 {
325 /* shift the argument right by the required places */
326 shr_Xsig(&argSqrd, 2*(-1-exponent));
327 }
328
329 argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
330 argTo4.lsw = argSqrd.lsw;
331 mul_Xsig_Xsig(&argTo4, &argTo4);
332
333 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
334 N_COEFF_N-1);
335 mul_Xsig_Xsig(&accumulator, &argSqrd);
336 negate_Xsig(&accumulator);
337
338 polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
339 N_COEFF_P-1);
340
341 shr_Xsig(&accumulator, 2); /* Divide by four */
342 accumulator.msw |= 0x80000000; /* Add 1.0 */
343
344 mul64_Xsig(&accumulator, &fixed_arg);
345 mul64_Xsig(&accumulator, &fixed_arg);
346 mul64_Xsig(&accumulator, &fixed_arg);
347
348 /* Divide by four, FPU_REG compatible, etc */
349 exponent = 3*exponent;
350
351 /* The minimum exponent difference is 3 */
352 shr_Xsig(&accumulator, exp2 - exponent);
353
354 negate_Xsig(&accumulator);
355 XSIG_LL(accumulator) += fixed_arg;
356
357 /* The basic computation is complete. Now fix the answer to
358 compensate for the error due to the approximation used for
359 pi/2
360 */
361
362 /* This has an exponent of -65 */
363 XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
364 fix_up.lsw = 0;
365
366 /* The fix-up needs to be improved for larger args */
367 if ( argSqrd.msw & 0xffc00000 )
368 {
369 /* Get about 32 bit precision in these: */
370 fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
371 fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
372 }
373
374 exp2 += norm_Xsig(&accumulator);
375 shr_Xsig(&accumulator, 1); /* Prevent overflow */
376 exp2++;
377 shr_Xsig(&fix_up, 65 + exp2);
378
379 add_Xsig_Xsig(&accumulator, &fix_up);
380
381 echange = round_Xsig(&accumulator);
382
383 setexponentpos(&result, exp2 + echange);
384 significand(&result) = XSIG_LL(accumulator);
385 }
386
387 FPU_copy_to_reg0(&result, TAG_Valid);
388
389#ifdef PARANOID
390 if ( (exponent(&result) >= 0)
391 && (significand(&result) > 0x8000000000000000LL) )
392 {
393 EXCEPTION(EX_INTERNAL|0x151);
394 }
395#endif /* PARANOID */
396
397}
diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c
new file mode 100644
index 000000000000..8df3e03b6e6f
--- /dev/null
+++ b/arch/x86/math-emu/poly_tan.c
@@ -0,0 +1,222 @@
1/*---------------------------------------------------------------------------+
2 | poly_tan.c |
3 | |
 4 | Compute the tan of an FPU_REG, using a polynomial approximation.          |
5 | |
6 | Copyright (C) 1992,1993,1994,1997,1999 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@melbpc.org.au |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "reg_constant.h"
15#include "fpu_emu.h"
16#include "fpu_system.h"
17#include "control_w.h"
18#include "poly.h"
19
20
21#define HiPOWERop 3 /* odd poly, positive terms */
22static const unsigned long long oddplterm[HiPOWERop] =
23{
24 0x0000000000000000LL,
25 0x0051a1cf08fca228LL,
26 0x0000000071284ff7LL
27};
28
29#define HiPOWERon 2 /* odd poly, negative terms */
30static const unsigned long long oddnegterm[HiPOWERon] =
31{
32 0x1291a9a184244e80LL,
33 0x0000583245819c21LL
34};
35
36#define HiPOWERep 2 /* even poly, positive terms */
37static const unsigned long long evenplterm[HiPOWERep] =
38{
39 0x0e848884b539e888LL,
40 0x00003c7f18b887daLL
41};
42
43#define HiPOWERen 2 /* even poly, negative terms */
44static const unsigned long long evennegterm[HiPOWERen] =
45{
46 0xf1f0200fd51569ccLL,
47 0x003afb46105c4432LL
48};
49
50static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
51
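/*
 * Note (not stated in the original source): 'twothirds' is 2/3 scaled
 * by 2^64 (0xaaaa.../2^64 ~= 0.6667).  poly_tan() below builds
 * tan(arg) as arg plus a correction of roughly arg^3/3 times a ratio
 * of the odd and even polynomials above, and handles arguments larger
 * than about pi/4 through the identity
 *	tan(arg) = 1 / tan(pi/2 - arg).
 */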
52
53/*--- poly_tan() ------------------------------------------------------------+
54 | |
55 +---------------------------------------------------------------------------*/
56void poly_tan(FPU_REG *st0_ptr)
57{
58 long int exponent;
59 int invert;
60 Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
61 argSignif, fix_up;
62 unsigned long adj;
63
64 exponent = exponent(st0_ptr);
65
66#ifdef PARANOID
67 if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */
68 { arith_invalid(0); return; } /* Need a positive number */
69#endif /* PARANOID */
70
71 /* Split the problem into two domains, smaller and larger than pi/4 */
72 if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) )
73 {
74 /* The argument is greater than (approx) pi/4 */
75 invert = 1;
76 accum.lsw = 0;
77 XSIG_LL(accum) = significand(st0_ptr);
78
79 if ( exponent == 0 )
80 {
81 /* The argument is >= 1.0 */
82 /* Put the binary point at the left. */
83 XSIG_LL(accum) <<= 1;
84 }
85 /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
86 XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
87 /* This is a special case which arises due to rounding. */
88 if ( XSIG_LL(accum) == 0xffffffffffffffffLL )
89 {
90 FPU_settag0(TAG_Valid);
91 significand(st0_ptr) = 0x8a51e04daabda360LL;
92 setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative);
93 return;
94 }
95
96 argSignif.lsw = accum.lsw;
97 XSIG_LL(argSignif) = XSIG_LL(accum);
98 exponent = -1 + norm_Xsig(&argSignif);
99 }
100 else
101 {
102 invert = 0;
103 argSignif.lsw = 0;
104 XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
105
106 if ( exponent < -1 )
107 {
108 /* shift the argument right by the required places */
109 if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U )
110 XSIG_LL(accum) ++; /* round up */
111 }
112 }
113
114 XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw;
115 mul_Xsig_Xsig(&argSq, &argSq);
116 XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw;
117 mul_Xsig_Xsig(&argSqSq, &argSqSq);
118
119 /* Compute the negative terms for the numerator polynomial */
120 accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
121 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1);
122 mul_Xsig_Xsig(&accumulatoro, &argSq);
123 negate_Xsig(&accumulatoro);
124 /* Add the positive terms */
125 polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1);
126
127
128 /* Compute the positive terms for the denominator polynomial */
129 accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
130 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1);
131 mul_Xsig_Xsig(&accumulatore, &argSq);
132 negate_Xsig(&accumulatore);
133 /* Add the negative terms */
134 polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1);
135 /* Multiply by arg^2 */
136 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
137 mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
138 /* de-normalize and divide by 2 */
139 shr_Xsig(&accumulatore, -2*(1+exponent) + 1);
140 negate_Xsig(&accumulatore); /* This does 1 - accumulator */
141
142 /* Now find the ratio. */
143 if ( accumulatore.msw == 0 )
144 {
145 /* accumulatoro must contain 1.0 here, (actually, 0) but it
146 really doesn't matter what value we use because it will
147 have negligible effect in later calculations
148 */
149 XSIG_LL(accum) = 0x8000000000000000LL;
150 accum.lsw = 0;
151 }
152 else
153 {
154 div_Xsig(&accumulatoro, &accumulatore, &accum);
155 }
156
157 /* Multiply by 1/3 * arg^3 */
158 mul64_Xsig(&accum, &XSIG_LL(argSignif));
159 mul64_Xsig(&accum, &XSIG_LL(argSignif));
160 mul64_Xsig(&accum, &XSIG_LL(argSignif));
161 mul64_Xsig(&accum, &twothirds);
162 shr_Xsig(&accum, -2*(exponent+1));
163
164 /* tan(arg) = arg + accum */
165 add_two_Xsig(&accum, &argSignif, &exponent);
166
167 if ( invert )
168 {
169 /* We now have the value of tan(pi_2 - arg) where pi_2 is an
170 approximation for pi/2
171 */
172 /* The next step is to fix the answer to compensate for the
173 error due to the approximation used for pi/2
174 */
175
176 /* This is (approx) delta, the error in our approx for pi/2
177 (see above). It has an exponent of -65
178 */
179 XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
180 fix_up.lsw = 0;
181
182 if ( exponent == 0 )
183 adj = 0xffffffff; /* We want approx 1.0 here, but
184 this is close enough. */
185 else if ( exponent > -30 )
186 {
187 adj = accum.msw >> -(exponent+1); /* tan */
188 adj = mul_32_32(adj, adj); /* tan^2 */
189 }
190 else
191 adj = 0;
192 adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */
193
194 fix_up.msw += adj;
195 if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */
196 {
197 /* Yes, we need to add an msb */
198 shr_Xsig(&fix_up, 1);
199 fix_up.msw |= 0x80000000;
200 shr_Xsig(&fix_up, 64 + exponent);
201 }
202 else
203 shr_Xsig(&fix_up, 65 + exponent);
204
205 add_two_Xsig(&accum, &fix_up, &exponent);
206
207 /* accum now contains tan(pi/2 - arg).
208 Use tan(arg) = 1.0 / tan(pi/2 - arg)
209 */
210 accumulatoro.lsw = accumulatoro.midw = 0;
211 accumulatoro.msw = 0x80000000;
212 div_Xsig(&accumulatoro, &accum, &accum);
213 exponent = - exponent - 1;
214 }
215
216 /* Transfer the result */
217 round_Xsig(&accum);
218 FPU_settag0(TAG_Valid);
219 significand(st0_ptr) = XSIG_LL(accum);
220 setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
221
222}
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
new file mode 100644
index 000000000000..17315c89ff3d
--- /dev/null
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -0,0 +1,135 @@
1/*---------------------------------------------------------------------------+
2 | polynomial_Xsig.S |
3 | |
4 | Fixed point arithmetic polynomial evaluation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1995 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Call from C as: |
 11 | void polynomial_Xsig(Xsig *accum, const unsigned long long *x,           |
12 | unsigned long long terms[], int n) |
13 | |
14 | Computes: |
 15 | terms[0] + (terms[1] + (terms[2] + ... + terms[n]*x)*x)*x                |
16 | and adds the result to the 12 byte Xsig. |
17 | The terms[] are each 8 bytes, but all computation is performed to 12 byte |
18 | precision. |
19 | |
20 | This function must be used carefully: most overflow of intermediate |
21 | results is controlled, but overflow of the result is not. |
22 | |
23 +---------------------------------------------------------------------------*/
24 .file "polynomial_Xsig.S"
25
26#include "fpu_emu.h"
27
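/*
 * Illustrative C model of the evaluation described above, not part of
 * the original emulator (the assembly below works in 96-bit fixed
 * point).  The callers pass the index of the highest coefficient
 * (HIPOWER-1 and the like), so 'n' is used that way here:
 *
 *	static double polynomial_ref(double accum, double x,
 *				     const double terms[], int n)
 *	{
 *		double sum = terms[n];
 *		int i;
 *
 *		for (i = n - 1; i >= 0; i--)
 *			sum = sum * x + terms[i];
 *		return accum + sum;
 *	}
 */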
28
29#define TERM_SIZE $8
30#define SUM_MS -20(%ebp) /* sum ms long */
31#define SUM_MIDDLE -24(%ebp) /* sum middle long */
32#define SUM_LS -28(%ebp) /* sum ls long */
33#define ACCUM_MS -4(%ebp) /* accum ms long */
34#define ACCUM_MIDDLE -8(%ebp) /* accum middle long */
35#define ACCUM_LS -12(%ebp) /* accum ls long */
36#define OVERFLOWED -16(%ebp) /* addition overflow flag */
37
38.text
39ENTRY(polynomial_Xsig)
40 pushl %ebp
41 movl %esp,%ebp
42 subl $32,%esp
43 pushl %esi
44 pushl %edi
45 pushl %ebx
46
47 movl PARAM2,%esi /* x */
48 movl PARAM3,%edi /* terms */
49
50 movl TERM_SIZE,%eax
51 mull PARAM4 /* n */
52 addl %eax,%edi
53
 54 movl 4(%edi),%edx /* terms[n] ms long */
55 movl %edx,SUM_MS
 56 movl (%edi),%edx /* terms[n] ls long */
57 movl %edx,SUM_MIDDLE
58 xor %eax,%eax
59 movl %eax,SUM_LS
60 movb %al,OVERFLOWED
61
62 subl TERM_SIZE,%edi
63 decl PARAM4
64 js L_accum_done
65
66L_accum_loop:
67 xor %eax,%eax
68 movl %eax,ACCUM_MS
69 movl %eax,ACCUM_MIDDLE
70
71 movl SUM_MIDDLE,%eax
72 mull (%esi) /* x ls long */
73 movl %edx,ACCUM_LS
74
75 movl SUM_MIDDLE,%eax
76 mull 4(%esi) /* x ms long */
77 addl %eax,ACCUM_LS
78 adcl %edx,ACCUM_MIDDLE
79 adcl $0,ACCUM_MS
80
81 movl SUM_MS,%eax
82 mull (%esi) /* x ls long */
83 addl %eax,ACCUM_LS
84 adcl %edx,ACCUM_MIDDLE
85 adcl $0,ACCUM_MS
86
87 movl SUM_MS,%eax
88 mull 4(%esi) /* x ms long */
89 addl %eax,ACCUM_MIDDLE
90 adcl %edx,ACCUM_MS
91
92 testb $0xff,OVERFLOWED
93 jz L_no_overflow
94
95 movl (%esi),%eax
96 addl %eax,ACCUM_MIDDLE
97 movl 4(%esi),%eax
98 adcl %eax,ACCUM_MS /* This could overflow too */
99
100L_no_overflow:
101
102/*
103 * Now put the sum of next term and the accumulator
104 * into the sum register
105 */
106 movl ACCUM_LS,%eax
107 addl (%edi),%eax /* term ls long */
108 movl %eax,SUM_LS
109 movl ACCUM_MIDDLE,%eax
110 adcl (%edi),%eax /* term ls long */
111 movl %eax,SUM_MIDDLE
112 movl ACCUM_MS,%eax
113 adcl 4(%edi),%eax /* term ms long */
114 movl %eax,SUM_MS
115 sbbb %al,%al
116 movb %al,OVERFLOWED /* Used in the next iteration */
117
118 subl TERM_SIZE,%edi
119 decl PARAM4
120 jns L_accum_loop
121
122L_accum_done:
123 movl PARAM1,%edi /* accum */
124 movl SUM_LS,%eax
125 addl %eax,(%edi)
126 movl SUM_MIDDLE,%eax
127 adcl %eax,4(%edi)
128 movl SUM_MS,%eax
129 adcl %eax,8(%edi)
130
131 popl %ebx
132 popl %edi
133 popl %esi
134 leave
135 ret
diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c
new file mode 100644
index 000000000000..7cd3b37ac084
--- /dev/null
+++ b/arch/x86/math-emu/reg_add_sub.c
@@ -0,0 +1,374 @@
1/*---------------------------------------------------------------------------+
2 | reg_add_sub.c |
3 | |
4 | Functions to add or subtract two registers and put the result in a third. |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | For each function, the destination may be any FPU_REG, including one of |
15 | the source FPU_REGs. |
 16 | Each function returns the tag of the answer, or-ed with FPU_Exception    |
 17 | if an exception was raised, or a negative value if an internal error     |
 18 | occurred.                                                                 |
19 +---------------------------------------------------------------------------*/
20
21#include "exception.h"
22#include "reg_constant.h"
23#include "fpu_emu.h"
24#include "control_w.h"
25#include "fpu_system.h"
26
27static
28int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
29 FPU_REG const *b, u_char tagb, u_char signb,
30 FPU_REG *dest, int deststnr, int control_w);
31
32/*
33 Operates on st(0) and st(n), or on st(0) and temporary data.
34 The destination must be one of the source st(x).
35 */
36int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
37{
38 FPU_REG *a = &st(0);
39 FPU_REG *dest = &st(deststnr);
40 u_char signb = getsign(b);
41 u_char taga = FPU_gettag0();
42 u_char signa = getsign(a);
43 u_char saved_sign = getsign(dest);
44 int diff, tag, expa, expb;
45
46 if ( !(taga | tagb) )
47 {
48 expa = exponent(a);
49 expb = exponent(b);
50
51 valid_add:
52 /* Both registers are valid */
53 if (!(signa ^ signb))
54 {
55 /* signs are the same */
56 tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb);
57 }
58 else
59 {
60 /* The signs are different, so do a subtraction */
61 diff = expa - expb;
62 if (!diff)
63 {
64 diff = a->sigh - b->sigh; /* This works only if the ms bits
65 are identical. */
66 if (!diff)
67 {
68 diff = a->sigl > b->sigl;
69 if (!diff)
70 diff = -(a->sigl < b->sigl);
71 }
72 }
73
74 if (diff > 0)
75 {
76 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
77 }
78 else if ( diff < 0 )
79 {
80 tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa);
81 }
82 else
83 {
84 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
85 /* sign depends upon rounding mode */
86 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
87 ? SIGN_POS : SIGN_NEG);
88 return TAG_Zero;
89 }
90 }
91
92 if ( tag < 0 )
93 {
94 setsign(dest, saved_sign);
95 return tag;
96 }
97 FPU_settagi(deststnr, tag);
98 return tag;
99 }
100
101 if ( taga == TAG_Special )
102 taga = FPU_Special(a);
103 if ( tagb == TAG_Special )
104 tagb = FPU_Special(b);
105
106 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
107 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
108 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
109 {
110 FPU_REG x, y;
111
112 if ( denormal_operand() < 0 )
113 return FPU_Exception;
114
115 FPU_to_exp16(a, &x);
116 FPU_to_exp16(b, &y);
117 a = &x;
118 b = &y;
119 expa = exponent16(a);
120 expb = exponent16(b);
121 goto valid_add;
122 }
123
124 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
125 {
126 if ( deststnr == 0 )
127 return real_2op_NaN(b, tagb, deststnr, a);
128 else
129 return real_2op_NaN(a, taga, deststnr, a);
130 }
131
132 return add_sub_specials(a, taga, signa, b, tagb, signb,
133 dest, deststnr, control_w);
134}
135
136
137/* Subtract b from a. (a-b) -> dest */
138int FPU_sub(int flags, int rm, int control_w)
139{
140 FPU_REG const *a, *b;
141 FPU_REG *dest;
142 u_char taga, tagb, signa, signb, saved_sign, sign;
143 int diff, tag = 0, expa, expb, deststnr;
144
145 a = &st(0);
146 taga = FPU_gettag0();
147
148 deststnr = 0;
149 if ( flags & LOADED )
150 {
151 b = (FPU_REG *)rm;
152 tagb = flags & 0x0f;
153 }
154 else
155 {
156 b = &st(rm);
157 tagb = FPU_gettagi(rm);
158
159 if ( flags & DEST_RM )
160 deststnr = rm;
161 }
162
163 signa = getsign(a);
164 signb = getsign(b);
165
166 if ( flags & REV )
167 {
168 signa ^= SIGN_NEG;
169 signb ^= SIGN_NEG;
170 }
171
172 dest = &st(deststnr);
173 saved_sign = getsign(dest);
174
175 if ( !(taga | tagb) )
176 {
177 expa = exponent(a);
178 expb = exponent(b);
179
180 valid_subtract:
181 /* Both registers are valid */
182
183 diff = expa - expb;
184
185 if (!diff)
186 {
187 diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
188 if (!diff)
189 {
190 diff = a->sigl > b->sigl;
191 if (!diff)
192 diff = -(a->sigl < b->sigl);
193 }
194 }
195
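	/* A note on the expression below (assuming SIGN_POS is 0, as the
	   sign arithmetic in this file requires): signa and signb are each
	   either SIGN_POS or SIGN_NEG, so dividing their weighted sum by
	   SIGN_NEG yields 0 for P-P, 1 for P-N, 2 for N-P and 3 for N-N. */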
196 switch ( (((int)signa)*2 + signb) / SIGN_NEG )
197 {
198 case 0: /* P - P */
199 case 3: /* N - N */
200 if (diff > 0)
201 {
202 /* |a| > |b| */
203 tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
204 }
205 else if ( diff == 0 )
206 {
207 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
208
209 /* sign depends upon rounding mode */
210 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
211 ? SIGN_POS : SIGN_NEG);
212 return TAG_Zero;
213 }
214 else
215 {
216 sign = signa ^ SIGN_NEG;
217 tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa);
218 }
219 break;
220 case 1: /* P - N */
221 tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb);
222 break;
223 case 2: /* N - P */
224 tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb);
225 break;
226#ifdef PARANOID
227 default:
228 EXCEPTION(EX_INTERNAL|0x111);
229 return -1;
230#endif
231 }
232 if ( tag < 0 )
233 {
234 setsign(dest, saved_sign);
235 return tag;
236 }
237 FPU_settagi(deststnr, tag);
238 return tag;
239 }
240
241 if ( taga == TAG_Special )
242 taga = FPU_Special(a);
243 if ( tagb == TAG_Special )
244 tagb = FPU_Special(b);
245
246 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
247 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
248 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
249 {
250 FPU_REG x, y;
251
252 if ( denormal_operand() < 0 )
253 return FPU_Exception;
254
255 FPU_to_exp16(a, &x);
256 FPU_to_exp16(b, &y);
257 a = &x;
258 b = &y;
259 expa = exponent16(a);
260 expb = exponent16(b);
261
262 goto valid_subtract;
263 }
264
265 if ( (taga == TW_NaN) || (tagb == TW_NaN) )
266 {
267 FPU_REG const *d1, *d2;
268 if ( flags & REV )
269 {
270 d1 = b;
271 d2 = a;
272 }
273 else
274 {
275 d1 = a;
276 d2 = b;
277 }
278 if ( flags & LOADED )
279 return real_2op_NaN(b, tagb, deststnr, d1);
280 if ( flags & DEST_RM )
281 return real_2op_NaN(a, taga, deststnr, d2);
282 else
283 return real_2op_NaN(b, tagb, deststnr, d2);
284 }
285
286 return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
287 dest, deststnr, control_w);
288}
289
290
291static
292int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
293 FPU_REG const *b, u_char tagb, u_char signb,
294 FPU_REG *dest, int deststnr, int control_w)
295{
296 if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
297 && (denormal_operand() < 0) )
298 return FPU_Exception;
299
300 if (taga == TAG_Zero)
301 {
302 if (tagb == TAG_Zero)
303 {
304 /* Both are zero, result will be zero. */
305 u_char different_signs = signa ^ signb;
306
307 FPU_copy_to_regi(a, TAG_Zero, deststnr);
308 if ( different_signs )
309 {
310 /* Signs are different. */
311 /* Sign of answer depends upon rounding mode. */
312 setsign(dest, ((control_w & CW_RC) != RC_DOWN)
313 ? SIGN_POS : SIGN_NEG);
314 }
315 else
316 setsign(dest, signa); /* signa may differ from the sign of a. */
317 return TAG_Zero;
318 }
319 else
320 {
321 reg_copy(b, dest);
322 if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) )
323 {
324 /* A pseudoDenormal, convert it. */
325 addexponent(dest, 1);
326 tagb = TAG_Valid;
327 }
328 else if ( tagb > TAG_Empty )
329 tagb = TAG_Special;
330 setsign(dest, signb); /* signb may differ from the sign of b. */
331 FPU_settagi(deststnr, tagb);
332 return tagb;
333 }
334 }
335 else if (tagb == TAG_Zero)
336 {
337 reg_copy(a, dest);
338 if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) )
339 {
340 /* A pseudoDenormal */
341 addexponent(dest, 1);
342 taga = TAG_Valid;
343 }
344 else if ( taga > TAG_Empty )
345 taga = TAG_Special;
346 setsign(dest, signa); /* signa may differ from the sign of a. */
347 FPU_settagi(deststnr, taga);
348 return taga;
349 }
350 else if (taga == TW_Infinity)
351 {
352 if ( (tagb != TW_Infinity) || (signa == signb) )
353 {
354 FPU_copy_to_regi(a, TAG_Special, deststnr);
355 setsign(dest, signa); /* signa may differ from the sign of a. */
356 return taga;
357 }
358 /* Infinity-Infinity is undefined. */
359 return arith_invalid(deststnr);
360 }
361 else if (tagb == TW_Infinity)
362 {
363 FPU_copy_to_regi(b, TAG_Special, deststnr);
364 setsign(dest, signb); /* signb may differ from the sign of b. */
365 return tagb;
366 }
367
368#ifdef PARANOID
369 EXCEPTION(EX_INTERNAL|0x101);
370#endif
371
372 return FPU_Exception;
373}
374
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
new file mode 100644
index 000000000000..f37c5b5a35ad
--- /dev/null
+++ b/arch/x86/math-emu/reg_compare.c
@@ -0,0 +1,381 @@
1/*---------------------------------------------------------------------------+
2 | reg_compare.c |
3 | |
4 | Compare two floating point registers |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | compare() is the core FPU_REG comparison function |
15 +---------------------------------------------------------------------------*/
16
17#include "fpu_system.h"
18#include "exception.h"
19#include "fpu_emu.h"
20#include "control_w.h"
21#include "status_w.h"
22
23
24static int compare(FPU_REG const *b, int tagb)
25{
26 int diff, exp0, expb;
27 u_char st0_tag;
28 FPU_REG *st0_ptr;
29 FPU_REG x, y;
30 u_char st0_sign, signb = getsign(b);
31
32 st0_ptr = &st(0);
33 st0_tag = FPU_gettag0();
34 st0_sign = getsign(st0_ptr);
35
36 if ( tagb == TAG_Special )
37 tagb = FPU_Special(b);
38 if ( st0_tag == TAG_Special )
39 st0_tag = FPU_Special(st0_ptr);
40
41 if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
42 || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) )
43 {
44 if ( st0_tag == TAG_Zero )
45 {
46 if ( tagb == TAG_Zero ) return COMP_A_eq_B;
47 if ( tagb == TAG_Valid )
48 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
49 if ( tagb == TW_Denormal )
50 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
51 | COMP_Denormal;
52 }
53 else if ( tagb == TAG_Zero )
54 {
55 if ( st0_tag == TAG_Valid )
56 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
57 if ( st0_tag == TW_Denormal )
58 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
59 | COMP_Denormal;
60 }
61
62 if ( st0_tag == TW_Infinity )
63 {
64 if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) )
65 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
66 else if ( tagb == TW_Denormal )
67 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
68 | COMP_Denormal;
69 else if ( tagb == TW_Infinity )
70 {
71 /* The 80486 book says that infinities can be equal! */
72 return (st0_sign == signb) ? COMP_A_eq_B :
73 ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
74 }
75 /* Fall through to the NaN code */
76 }
77 else if ( tagb == TW_Infinity )
78 {
79 if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) )
80 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
81 if ( st0_tag == TW_Denormal )
82 return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
83 | COMP_Denormal;
84 /* Fall through to the NaN code */
85 }
86
87 /* The only possibility now should be that one of the arguments
88 is a NaN */
89 if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) )
90 {
91 int signalling = 0, unsupported = 0;
92 if ( st0_tag == TW_NaN )
93 {
94 signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000;
95 unsupported = !((exponent(st0_ptr) == EXP_OVER)
96 && (st0_ptr->sigh & 0x80000000));
97 }
98 if ( tagb == TW_NaN )
99 {
100 signalling |= (b->sigh & 0xc0000000) == 0x80000000;
101 unsupported |= !((exponent(b) == EXP_OVER)
102 && (b->sigh & 0x80000000));
103 }
104 if ( signalling || unsupported )
105 return COMP_No_Comp | COMP_SNaN | COMP_NaN;
106 else
107 /* Neither is a signaling NaN */
108 return COMP_No_Comp | COMP_NaN;
109 }
110
111 EXCEPTION(EX_Invalid);
112 }
113
114 if (st0_sign != signb)
115 {
116 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
117 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
118 COMP_Denormal : 0);
119 }
120
121 if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) )
122 {
123 FPU_to_exp16(st0_ptr, &x);
124 FPU_to_exp16(b, &y);
125 st0_ptr = &x;
126 b = &y;
127 exp0 = exponent16(st0_ptr);
128 expb = exponent16(b);
129 }
130 else
131 {
132 exp0 = exponent(st0_ptr);
133 expb = exponent(b);
134 }
135
136#ifdef PARANOID
137 if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
138 if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
139#endif /* PARANOID */
140
141 diff = exp0 - expb;
142 if ( diff == 0 )
143 {
144 diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
145 identical */
146 if ( diff == 0 )
147 {
148 diff = st0_ptr->sigl > b->sigl;
149 if ( diff == 0 )
150 diff = -(st0_ptr->sigl < b->sigl);
151 }
152 }
153
154 if ( diff > 0 )
155 {
156 return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
157 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
158 COMP_Denormal : 0);
159 }
160 if ( diff < 0 )
161 {
162 return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
163 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
164 COMP_Denormal : 0);
165 }
166
167 return COMP_A_eq_B
168 | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
169 COMP_Denormal : 0);
170
171}
172
173
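/*
 * For reference (standard 80x87 behaviour, not restated in the original
 * file): the functions below report the comparison in the C3, C2 and C0
 * bits of the status word,
 *	C3 C2 C0 = 0 0 0   ST(0) > source
 *	           0 0 1   ST(0) < source
 *	           1 0 0   ST(0) = source
 *	           1 1 1   unordered (at least one NaN)
 * which is what the setcc() calls encode via SW_C3/SW_C2/SW_C0.
 */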
174/* This function requires that st(0) is not empty */
175int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
176{
177 int f = 0, c;
178
179 c = compare(loaded_data, loaded_tag);
180
181 if (c & COMP_NaN)
182 {
183 EXCEPTION(EX_Invalid);
184 f = SW_C3 | SW_C2 | SW_C0;
185 }
186 else
187 switch (c & 7)
188 {
189 case COMP_A_lt_B:
190 f = SW_C0;
191 break;
192 case COMP_A_eq_B:
193 f = SW_C3;
194 break;
195 case COMP_A_gt_B:
196 f = 0;
197 break;
198 case COMP_No_Comp:
199 f = SW_C3 | SW_C2 | SW_C0;
200 break;
201#ifdef PARANOID
202 default:
203 EXCEPTION(EX_INTERNAL|0x121);
204 f = SW_C3 | SW_C2 | SW_C0;
205 break;
206#endif /* PARANOID */
207 }
208 setcc(f);
209 if (c & COMP_Denormal)
210 {
211 return denormal_operand() < 0;
212 }
213 return 0;
214}
215
216
217static int compare_st_st(int nr)
218{
219 int f = 0, c;
220 FPU_REG *st_ptr;
221
222 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
223 {
224 setcc(SW_C3 | SW_C2 | SW_C0);
225 /* Stack fault */
226 EXCEPTION(EX_StackUnder);
227 return !(control_word & CW_Invalid);
228 }
229
230 st_ptr = &st(nr);
231 c = compare(st_ptr, FPU_gettagi(nr));
232 if (c & COMP_NaN)
233 {
234 setcc(SW_C3 | SW_C2 | SW_C0);
235 EXCEPTION(EX_Invalid);
236 return !(control_word & CW_Invalid);
237 }
238 else
239 switch (c & 7)
240 {
241 case COMP_A_lt_B:
242 f = SW_C0;
243 break;
244 case COMP_A_eq_B:
245 f = SW_C3;
246 break;
247 case COMP_A_gt_B:
248 f = 0;
249 break;
250 case COMP_No_Comp:
251 f = SW_C3 | SW_C2 | SW_C0;
252 break;
253#ifdef PARANOID
254 default:
255 EXCEPTION(EX_INTERNAL|0x122);
256 f = SW_C3 | SW_C2 | SW_C0;
257 break;
258#endif /* PARANOID */
259 }
260 setcc(f);
261 if (c & COMP_Denormal)
262 {
263 return denormal_operand() < 0;
264 }
265 return 0;
266}
267
268
269static int compare_u_st_st(int nr)
270{
271 int f = 0, c;
272 FPU_REG *st_ptr;
273
274 if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
275 {
276 setcc(SW_C3 | SW_C2 | SW_C0);
277 /* Stack fault */
278 EXCEPTION(EX_StackUnder);
279 return !(control_word & CW_Invalid);
280 }
281
282 st_ptr = &st(nr);
283 c = compare(st_ptr, FPU_gettagi(nr));
284 if (c & COMP_NaN)
285 {
286 setcc(SW_C3 | SW_C2 | SW_C0);
287 if (c & COMP_SNaN) /* This is the only difference between
288 un-ordered and ordinary comparisons */
289 {
290 EXCEPTION(EX_Invalid);
291 return !(control_word & CW_Invalid);
292 }
293 return 0;
294 }
295 else
296 switch (c & 7)
297 {
298 case COMP_A_lt_B:
299 f = SW_C0;
300 break;
301 case COMP_A_eq_B:
302 f = SW_C3;
303 break;
304 case COMP_A_gt_B:
305 f = 0;
306 break;
307 case COMP_No_Comp:
308 f = SW_C3 | SW_C2 | SW_C0;
309 break;
310#ifdef PARANOID
311 default:
312 EXCEPTION(EX_INTERNAL|0x123);
313 f = SW_C3 | SW_C2 | SW_C0;
314 break;
315#endif /* PARANOID */
316 }
317 setcc(f);
318 if (c & COMP_Denormal)
319 {
320 return denormal_operand() < 0;
321 }
322 return 0;
323}
324
325/*---------------------------------------------------------------------------*/
326
327void fcom_st(void)
328{
329 /* fcom st(i) */
330 compare_st_st(FPU_rm);
331}
332
333
334void fcompst(void)
335{
336 /* fcomp st(i) */
337 if ( !compare_st_st(FPU_rm) )
338 FPU_pop();
339}
340
341
342void fcompp(void)
343{
344 /* fcompp */
345 if (FPU_rm != 1)
346 {
347 FPU_illegal();
348 return;
349 }
350 if ( !compare_st_st(1) )
351 poppop();
352}
353
354
355void fucom_(void)
356{
357 /* fucom st(i) */
358 compare_u_st_st(FPU_rm);
359
360}
361
362
363void fucomp(void)
364{
365 /* fucomp st(i) */
366 if ( !compare_u_st_st(FPU_rm) )
367 FPU_pop();
368}
369
370
371void fucompp(void)
372{
373 /* fucompp */
374 if (FPU_rm == 1)
375 {
376 if ( !compare_u_st_st(1) )
377 poppop();
378 }
379 else
380 FPU_illegal();
381}
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
new file mode 100644
index 000000000000..a85015801969
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.c
@@ -0,0 +1,120 @@
1/*---------------------------------------------------------------------------+
2 | reg_constant.c |
3 | |
4 | All of the constant FPU_REGs |
5 | |
6 | Copyright (C) 1992,1993,1994,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
8 | Australia. E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "fpu_system.h"
14#include "fpu_emu.h"
15#include "status_w.h"
16#include "reg_constant.h"
17#include "control_w.h"
18
19
20#define MAKE_REG(s,e,l,h) { l, h, \
21 ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
22
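/*
 * How to read these initialisers (a worked example, using the layout
 * implied by MAKE_REG: sigl, sigh, then a 16-bit biased exponent with
 * the sign in bit 15): CONST_PI below has significand 0xc90fdaa22168c235,
 * i.e. 0xc90fdaa2... / 2^63 ~= 1.5708, and an unbiased exponent of +1,
 * giving 1.5708 * 2 ~= 3.14159.
 */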
23FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
24#if 0
25FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
26FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
27#endif /* 0 */
28static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
29static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
30FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
31FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
32FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
33static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
34static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
35
36/* Extra bits to take pi/2 to more than 128 bits precision. */
37FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
38 0xfc8f8cbb, 0xece675d1);
39
40/* Only the sign (and tag) is used in internal zeroes */
41FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
42
43/* Only the sign and significand (and tag) are used in internal NaNs */
44/* The 80486 never generates one of these
45FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
46 */
47/* This is the real indefinite QNaN */
48FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
49
50/* Only the sign (and tag) is used in internal infinities */
51FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
52
53
54static void fld_const(FPU_REG const *c, int adj, u_char tag)
55{
56 FPU_REG *st_new_ptr;
57
58 if ( STACK_OVERFLOW )
59 {
60 FPU_stack_overflow();
61 return;
62 }
63 push();
64 reg_copy(c, st_new_ptr);
65 st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to
66 borrow or carry. */
67 FPU_settag0(tag);
68 clear_C1();
69}
70
71/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
72 (and not one of RC_RND or RC_UP).
73 */
74#define DOWN_OR_CHOP(x) (x & RC_DOWN)
75
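/*
 * Illustration, assuming the usual encoding in control_w.h (RC_RND=0x000,
 * RC_DOWN=0x400, RC_UP=0x800, RC_CHOP=0xC00): bit 10 is set exactly for
 * the two modes which round these positive constants downwards, hence the
 * single-bit test above:
 *
 *	DOWN_OR_CHOP(RC_RND)  == 0
 *	DOWN_OR_CHOP(RC_UP)   == 0
 *	DOWN_OR_CHOP(RC_DOWN) == RC_DOWN
 *	DOWN_OR_CHOP(RC_CHOP) == RC_DOWN
 */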
76static void fld1(int rc)
77{
78 fld_const(&CONST_1, 0, TAG_Valid);
79}
80
81static void fldl2t(int rc)
82{
83 fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
84}
85
86static void fldl2e(int rc)
87{
88 fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
89}
90
91static void fldpi(int rc)
92{
93 fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
94}
95
96static void fldlg2(int rc)
97{
98 fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
99}
100
101static void fldln2(int rc)
102{
103 fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
104}
105
106static void fldz(int rc)
107{
108 fld_const(&CONST_Z, 0, TAG_Zero);
109}
110
111typedef void (*FUNC_RC)(int);
112
113static FUNC_RC constants_table[] = {
114 fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal
115};
116
117void fconst(void)
118{
119 (constants_table[FPU_rm])(control_word & CW_RC);
120}
diff --git a/arch/x86/math-emu/reg_constant.h b/arch/x86/math-emu/reg_constant.h
new file mode 100644
index 000000000000..1bffaec3a134
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.h
@@ -0,0 +1,25 @@
1/*---------------------------------------------------------------------------+
2 | reg_constant.h |
3 | |
4 | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
5 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
6 | |
7 +---------------------------------------------------------------------------*/
8
9#ifndef _REG_CONSTANT_H_
10#define _REG_CONSTANT_H_
11
12#include "fpu_emu.h"
13
14extern FPU_REG const CONST_1;
15extern FPU_REG const CONST_PI;
16extern FPU_REG const CONST_PI2;
17extern FPU_REG const CONST_PI2extra;
18extern FPU_REG const CONST_PI4;
19extern FPU_REG const CONST_Z;
20extern FPU_REG const CONST_PINF;
21extern FPU_REG const CONST_INF;
22extern FPU_REG const CONST_MINF;
23extern FPU_REG const CONST_QNaN;
24
25#endif /* _REG_CONSTANT_H_ */
diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c
new file mode 100644
index 000000000000..45a258752703
--- /dev/null
+++ b/arch/x86/math-emu/reg_convert.c
@@ -0,0 +1,53 @@
1/*---------------------------------------------------------------------------+
2 | reg_convert.c |
3 | |
4 | Convert register representation. |
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13#include "exception.h"
14#include "fpu_emu.h"
15
16
17int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
18{
19 int sign = getsign(a);
20
21 *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
22
23 /* Set up the exponent as a 16 bit quantity. */
24 setexponent16(x, exponent(a));
25
26 if ( exponent16(x) == EXP_UNDER )
27 {
28 /* The number is a de-normal or pseudodenormal. */
29 /* We only deal with the significand and exponent. */
30
31 if (x->sigh & 0x80000000)
32 {
33 /* Is a pseudodenormal. */
34 /* This is non-80486 behaviour because the number
35 loses its 'denormal' identity. */
36 addexponent(x, 1);
37 }
38 else
39 {
40 /* Is a denormal. */
41 addexponent(x, 1);
42 FPU_normalize_nuo(x);
43 }
44 }
45
46 if ( !(x->sigh & 0x80000000) )
47 {
48 EXCEPTION(EX_INTERNAL | 0x180);
49 }
50
51 return sign;
52}
53
diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c
new file mode 100644
index 000000000000..5cee7ff920d9
--- /dev/null
+++ b/arch/x86/math-emu/reg_divide.c
@@ -0,0 +1,207 @@
1/*---------------------------------------------------------------------------+
2 | reg_divide.c |
3 | |
4 | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
5 | |
6 | Copyright (C) 1996 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@jacobi.maths.monash.edu.au |
9 | |
10 | Return value is the tag of the answer, or-ed with FPU_Exception if |
11 | one was raised, or -1 on internal error. |
12 | |
13 +---------------------------------------------------------------------------*/
14
15/*---------------------------------------------------------------------------+
16 | The destination may be any FPU_REG, including one of the source FPU_REGs. |
17 +---------------------------------------------------------------------------*/
18
19#include "exception.h"
20#include "reg_constant.h"
21#include "fpu_emu.h"
22#include "fpu_system.h"
23
24/*
25 Divide one register by another and put the result into a third register.
26 */
27int FPU_div(int flags, int rm, int control_w)
28{
29 FPU_REG x, y;
30 FPU_REG const *a, *b, *st0_ptr, *st_ptr;
31 FPU_REG *dest;
32 u_char taga, tagb, signa, signb, sign, saved_sign;
33 int tag, deststnr;
34
35 if ( flags & DEST_RM )
36 deststnr = rm;
37 else
38 deststnr = 0;
39
40 if ( flags & REV )
41 {
42 b = &st(0);
43 st0_ptr = b;
44 tagb = FPU_gettag0();
45 if ( flags & LOADED )
46 {
47 a = (FPU_REG *)rm;
48 taga = flags & 0x0f;
49 }
50 else
51 {
52 a = &st(rm);
53 st_ptr = a;
54 taga = FPU_gettagi(rm);
55 }
56 }
57 else
58 {
59 a = &st(0);
60 st0_ptr = a;
61 taga = FPU_gettag0();
62 if ( flags & LOADED )
63 {
64 b = (FPU_REG *)rm;
65 tagb = flags & 0x0f;
66 }
67 else
68 {
69 b = &st(rm);
70 st_ptr = b;
71 tagb = FPU_gettagi(rm);
72 }
73 }
74
75 signa = getsign(a);
76 signb = getsign(b);
77
78 sign = signa ^ signb;
79
80 dest = &st(deststnr);
81 saved_sign = getsign(dest);
82
83 if ( !(taga | tagb) )
84 {
85 /* Both regs Valid, this should be the most common case. */
86 reg_copy(a, &x);
87 reg_copy(b, &y);
88 setpositive(&x);
89 setpositive(&y);
90 tag = FPU_u_div(&x, &y, dest, control_w, sign);
91
92 if ( tag < 0 )
93 return tag;
94
95 FPU_settagi(deststnr, tag);
96 return tag;
97 }
98
99 if ( taga == TAG_Special )
100 taga = FPU_Special(a);
101 if ( tagb == TAG_Special )
102 tagb = FPU_Special(b);
103
104 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
105 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
106 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
107 {
108 if ( denormal_operand() < 0 )
109 return FPU_Exception;
110
111 FPU_to_exp16(a, &x);
112 FPU_to_exp16(b, &y);
113 tag = FPU_u_div(&x, &y, dest, control_w, sign);
114 if ( tag < 0 )
115 return tag;
116
117 FPU_settagi(deststnr, tag);
118 return tag;
119 }
120 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
121 {
122 if ( tagb != TAG_Zero )
123 {
 124 /* tagb is not Zero, so to reach here taga must be Zero: Zero/Valid. */
125 if ( tagb == TW_Denormal )
126 {
127 if ( denormal_operand() < 0 )
128 return FPU_Exception;
129 }
130
131 /* The result is zero. */
132 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
133 setsign(dest, sign);
134 return TAG_Zero;
135 }
136 /* We have an exception condition, either 0/0 or Valid/Zero. */
137 if ( taga == TAG_Zero )
138 {
139 /* 0/0 */
140 return arith_invalid(deststnr);
141 }
142 /* Valid/Zero */
143 return FPU_divide_by_zero(deststnr, sign);
144 }
145 /* Must have infinities, NaNs, etc */
146 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
147 {
148 if ( flags & LOADED )
149 return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr);
150
151 if ( flags & DEST_RM )
152 {
153 int tag;
154 tag = FPU_gettag0();
155 if ( tag == TAG_Special )
156 tag = FPU_Special(st0_ptr);
157 return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm));
158 }
159 else
160 {
161 int tag;
162 tag = FPU_gettagi(rm);
163 if ( tag == TAG_Special )
164 tag = FPU_Special(&st(rm));
165 return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm));
166 }
167 }
168 else if (taga == TW_Infinity)
169 {
170 if (tagb == TW_Infinity)
171 {
172 /* infinity/infinity */
173 return arith_invalid(deststnr);
174 }
175 else
176 {
177 /* tagb must be Valid or Zero */
178 if ( (tagb == TW_Denormal) && (denormal_operand() < 0) )
179 return FPU_Exception;
180
181 /* Infinity divided by Zero or Valid does
 182 not raise an exception, but returns Infinity */
183 FPU_copy_to_regi(a, TAG_Special, deststnr);
184 setsign(dest, sign);
185 return taga;
186 }
187 }
188 else if (tagb == TW_Infinity)
189 {
190 if ( (taga == TW_Denormal) && (denormal_operand() < 0) )
191 return FPU_Exception;
192
193 /* The result is zero. */
194 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
195 setsign(dest, sign);
196 return TAG_Zero;
197 }
198#ifdef PARANOID
199 else
200 {
201 EXCEPTION(EX_INTERNAL|0x102);
202 return FPU_Exception;
203 }
204#endif /* PARANOID */
205
206 return 0;
207}
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
new file mode 100644
index 000000000000..e976caef6498
--- /dev/null
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -0,0 +1,1375 @@
1/*---------------------------------------------------------------------------+
2 | reg_ld_str.c |
3 | |
4 | All of the functions which transfer data between user memory and FPU_REGs.|
5 | |
6 | Copyright (C) 1992,1993,1994,1996,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | |
11 +---------------------------------------------------------------------------*/
12
13/*---------------------------------------------------------------------------+
14 | Note: |
15 | The file contains code which accesses user memory. |
16 | Emulator static data may change when user memory is accessed, due to |
17 | other processes using the emulator while swapping is in progress. |
18 +---------------------------------------------------------------------------*/
19
20#include "fpu_emu.h"
21
22#include <asm/uaccess.h>
23
24#include "fpu_system.h"
25#include "exception.h"
26#include "reg_constant.h"
27#include "control_w.h"
28#include "status_w.h"
29
30
31#define DOUBLE_Emax 1023 /* largest valid exponent */
32#define DOUBLE_Ebias 1023
33#define DOUBLE_Emin (-1022) /* smallest valid exponent */
34
35#define SINGLE_Emax 127 /* largest valid exponent */
36#define SINGLE_Ebias 127
37#define SINGLE_Emin (-126) /* smallest valid exponent */
38
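/*
 * The loaders below re-bias incoming exponents with expressions of the
 * form "field - XXX_Ebias + EXTENDED_Ebias", EXTENDED_Ebias being the
 * bias of the emulator's internal 80-bit format (0x3fff in fpu_emu.h).
 * A worked example, assuming that value: the double 1.0 has a stored
 * exponent field of 0x3ff (1023), so 1023 - 1023 + 16383 = 16383, i.e.
 * an unbiased exponent of zero.  Its 52-bit fraction is shifted left by
 * 11 bits and the explicit integer bit 0x80000000 is set in sigh to form
 * the 64-bit extended significand.
 */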
39
40static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
41{
42 u_char tag;
43
44 setexponent16(r, exp);
45
46 tag = FPU_normalize_nuo(r);
47 stdexp(r);
48 if ( sign )
49 setnegative(r);
50
51 return tag;
52}
53
54
55int FPU_tagof(FPU_REG *ptr)
56{
57 int exp;
58
59 exp = exponent16(ptr) & 0x7fff;
60 if ( exp == 0 )
61 {
62 if ( !(ptr->sigh | ptr->sigl) )
63 {
64 return TAG_Zero;
65 }
66 /* The number is a de-normal or pseudodenormal. */
67 return TAG_Special;
68 }
69
70 if ( exp == 0x7fff )
71 {
72 /* Is an Infinity, a NaN, or an unsupported data type. */
73 return TAG_Special;
74 }
75
76 if ( !(ptr->sigh & 0x80000000) )
77 {
78 /* Unsupported data type. */
79 /* Valid numbers have the ms bit set to 1. */
80 /* Unnormal. */
81 return TAG_Special;
82 }
83
84 return TAG_Valid;
85}
86
87
88/* Get a long double from user memory */
89int FPU_load_extended(long double __user *s, int stnr)
90{
91 FPU_REG *sti_ptr = &st(stnr);
92
93 RE_ENTRANT_CHECK_OFF;
94 FPU_access_ok(VERIFY_READ, s, 10);
95 __copy_from_user(sti_ptr, s, 10);
96 RE_ENTRANT_CHECK_ON;
97
98 return FPU_tagof(sti_ptr);
99}
100
101
102/* Get a double from user memory */
103int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
104{
105 int exp, tag, negative;
106 unsigned m64, l64;
107
108 RE_ENTRANT_CHECK_OFF;
109 FPU_access_ok(VERIFY_READ, dfloat, 8);
110 FPU_get_user(m64, 1 + (unsigned long __user *) dfloat);
111 FPU_get_user(l64, (unsigned long __user *) dfloat);
112 RE_ENTRANT_CHECK_ON;
113
114 negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
115 exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
116 m64 &= 0xfffff;
117 if ( exp > DOUBLE_Emax + EXTENDED_Ebias )
118 {
119 /* Infinity or NaN */
120 if ((m64 == 0) && (l64 == 0))
121 {
122 /* +- infinity */
123 loaded_data->sigh = 0x80000000;
124 loaded_data->sigl = 0x00000000;
125 exp = EXP_Infinity + EXTENDED_Ebias;
126 tag = TAG_Special;
127 }
128 else
129 {
130 /* Must be a signaling or quiet NaN */
131 exp = EXP_NaN + EXTENDED_Ebias;
132 loaded_data->sigh = (m64 << 11) | 0x80000000;
133 loaded_data->sigh |= l64 >> 21;
134 loaded_data->sigl = l64 << 11;
135 tag = TAG_Special; /* The calling function must look for NaNs */
136 }
137 }
138 else if ( exp < DOUBLE_Emin + EXTENDED_Ebias )
139 {
140 /* Zero or de-normal */
141 if ((m64 == 0) && (l64 == 0))
142 {
143 /* Zero */
144 reg_copy(&CONST_Z, loaded_data);
145 exp = 0;
146 tag = TAG_Zero;
147 }
148 else
149 {
150 /* De-normal */
151 loaded_data->sigh = m64 << 11;
152 loaded_data->sigh |= l64 >> 21;
153 loaded_data->sigl = l64 << 11;
154
155 return normalize_no_excep(loaded_data, DOUBLE_Emin, negative)
156 | (denormal_operand() < 0 ? FPU_Exception : 0);
157 }
158 }
159 else
160 {
161 loaded_data->sigh = (m64 << 11) | 0x80000000;
162 loaded_data->sigh |= l64 >> 21;
163 loaded_data->sigl = l64 << 11;
164
165 tag = TAG_Valid;
166 }
167
168 setexponent16(loaded_data, exp | negative);
169
170 return tag;
171}
172
173
174/* Get a float from user memory */
175int FPU_load_single(float __user *single, FPU_REG *loaded_data)
176{
177 unsigned m32;
178 int exp, tag, negative;
179
180 RE_ENTRANT_CHECK_OFF;
181 FPU_access_ok(VERIFY_READ, single, 4);
182 FPU_get_user(m32, (unsigned long __user *) single);
183 RE_ENTRANT_CHECK_ON;
184
185 negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
186
187 if (!(m32 & 0x7fffffff))
188 {
189 /* Zero */
190 reg_copy(&CONST_Z, loaded_data);
191 addexponent(loaded_data, negative);
192 return TAG_Zero;
193 }
194 exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
195 m32 = (m32 & 0x7fffff) << 8;
196 if ( exp < SINGLE_Emin + EXTENDED_Ebias )
197 {
198 /* De-normals */
199 loaded_data->sigh = m32;
200 loaded_data->sigl = 0;
201
202 return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
203 | (denormal_operand() < 0 ? FPU_Exception : 0);
204 }
205 else if ( exp > SINGLE_Emax + EXTENDED_Ebias )
206 {
207 /* Infinity or NaN */
208 if ( m32 == 0 )
209 {
210 /* +- infinity */
211 loaded_data->sigh = 0x80000000;
212 loaded_data->sigl = 0x00000000;
213 exp = EXP_Infinity + EXTENDED_Ebias;
214 tag = TAG_Special;
215 }
216 else
217 {
218 /* Must be a signaling or quiet NaN */
219 exp = EXP_NaN + EXTENDED_Ebias;
220 loaded_data->sigh = m32 | 0x80000000;
221 loaded_data->sigl = 0;
222 tag = TAG_Special; /* The calling function must look for NaNs */
223 }
224 }
225 else
226 {
227 loaded_data->sigh = m32 | 0x80000000;
228 loaded_data->sigl = 0;
229 tag = TAG_Valid;
230 }
231
232 setexponent16(loaded_data, exp | negative); /* Set the sign. */
233
234 return tag;
235}
236
237
238/* Get a long long from user memory */
239int FPU_load_int64(long long __user *_s)
240{
241 long long s;
242 int sign;
243 FPU_REG *st0_ptr = &st(0);
244
245 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8);
247 if (copy_from_user(&s,_s,8))
248 FPU_abort;
249 RE_ENTRANT_CHECK_ON;
250
251 if (s == 0)
252 {
253 reg_copy(&CONST_Z, st0_ptr);
254 return TAG_Zero;
255 }
256
257 if (s > 0)
258 sign = SIGN_Positive;
259 else
260 {
261 s = -s;
262 sign = SIGN_Negative;
263 }
264
265 significand(st0_ptr) = s;
266
267 return normalize_no_excep(st0_ptr, 63, sign);
268}
269
270
271/* Get a long from user memory */
272int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
273{
274 long s;
275 int negative;
276
277 RE_ENTRANT_CHECK_OFF;
278 FPU_access_ok(VERIFY_READ, _s, 4);
279 FPU_get_user(s, _s);
280 RE_ENTRANT_CHECK_ON;
281
282 if (s == 0)
283 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
284
285 if (s > 0)
286 negative = SIGN_Positive;
287 else
288 {
289 s = -s;
290 negative = SIGN_Negative;
291 }
292
293 loaded_data->sigh = s;
294 loaded_data->sigl = 0;
295
296 return normalize_no_excep(loaded_data, 31, negative);
297}
298
299
300/* Get a short from user memory */
301int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
302{
303 int s, negative;
304
305 RE_ENTRANT_CHECK_OFF;
306 FPU_access_ok(VERIFY_READ, _s, 2);
307 /* Cast as short to get the sign extended. */
308 FPU_get_user(s, _s);
309 RE_ENTRANT_CHECK_ON;
310
311 if (s == 0)
312 { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
313
314 if (s > 0)
315 negative = SIGN_Positive;
316 else
317 {
318 s = -s;
319 negative = SIGN_Negative;
320 }
321
322 loaded_data->sigh = s << 16;
323 loaded_data->sigl = 0;
324
325 return normalize_no_excep(loaded_data, 15, negative);
326}
327
328
329/* Get a packed bcd array from user memory */
330int FPU_load_bcd(u_char __user *s)
331{
332 FPU_REG *st0_ptr = &st(0);
333 int pos;
334 u_char bcd;
335 long long l=0;
336 int sign;
337
338 RE_ENTRANT_CHECK_OFF;
339 FPU_access_ok(VERIFY_READ, s, 10);
340 RE_ENTRANT_CHECK_ON;
341 for ( pos = 8; pos >= 0; pos--)
342 {
343 l *= 10;
344 RE_ENTRANT_CHECK_OFF;
345 FPU_get_user(bcd, s+pos);
346 RE_ENTRANT_CHECK_ON;
347 l += bcd >> 4;
348 l *= 10;
349 l += bcd & 0x0f;
350 }
351
352 RE_ENTRANT_CHECK_OFF;
353 FPU_get_user(sign, s+9);
354 sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
355 RE_ENTRANT_CHECK_ON;
356
357 if ( l == 0 )
358 {
359 reg_copy(&CONST_Z, st0_ptr);
360 addexponent(st0_ptr, sign); /* Set the sign. */
361 return TAG_Zero;
362 }
363 else
364 {
365 significand(st0_ptr) = l;
366 return normalize_no_excep(st0_ptr, 63, sign);
367 }
368}
369
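FPU_load_bcd above walks the ten-byte packed-BCD operand from the most significant digit pair down. The following stand-alone user-space sketch (not part of the patch) decodes the same format; bcd_to_ll is an illustrative name and no range checking is done.

#include <stdio.h>

static long long bcd_to_ll(const unsigned char b[10])
{
    long long val = 0;
    int pos;

    for (pos = 8; pos >= 0; pos--) {
        val = val * 10 + (b[pos] >> 4);         /* high nibble: more significant digit */
        val = val * 10 + (b[pos] & 0x0f);       /* low nibble */
    }
    return (b[9] & 0x80) ? -val : val;          /* bit 7 of byte 9 is the sign */
}

int main(void)
{
    /* digits ...0001234 with sign byte 0x80: the value is -1234 */
    unsigned char b[10] = { 0x34, 0x12, 0, 0, 0, 0, 0, 0, 0, 0x80 };

    printf("%lld\n", bcd_to_ll(b));
    return 0;
}
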
370/*===========================================================================*/
371
372/* Put a long double into user memory */
373int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d)
374{
375 /*
376 The only exception raised by an attempt to store to an
377 extended format is the Invalid Stack exception, i.e.
378 attempting to store from an empty register.
379 */
380
381 if ( st0_tag != TAG_Empty )
382 {
383 RE_ENTRANT_CHECK_OFF;
384 FPU_access_ok(VERIFY_WRITE, d, 10);
385
386 FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d);
387 FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4));
388 FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8));
389 RE_ENTRANT_CHECK_ON;
390
391 return 1;
392 }
393
394 /* Empty register (stack underflow) */
395 EXCEPTION(EX_StackUnder);
396 if ( control_word & CW_Invalid )
397 {
398 /* The masked response */
399 /* Put out the QNaN indefinite */
400 RE_ENTRANT_CHECK_OFF;
401 FPU_access_ok(VERIFY_WRITE,d,10);
402 FPU_put_user(0, (unsigned long __user *) d);
403 FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d);
404 FPU_put_user(0xffff, 4 + (short __user *) d);
405 RE_ENTRANT_CHECK_ON;
406 return 1;
407 }
408 else
409 return 0;
410
411}
412
413
414/* Put a double into user memory */
415int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
416{
417 unsigned long l[2];
418 unsigned long increment = 0; /* avoid gcc warnings */
419 int precision_loss;
420 int exp;
421 FPU_REG tmp;
422
423 if ( st0_tag == TAG_Valid )
424 {
425 reg_copy(st0_ptr, &tmp);
426 exp = exponent(&tmp);
427
428 if ( exp < DOUBLE_Emin ) /* It may be a denormal */
429 {
430 addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
431
432 denormal_arg:
433
434 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
435 {
436#ifdef PECULIAR_486
437 /* Did it round to a non-denormal ? */
438 /* This behaviour might be regarded as peculiar; it appears
439 that the 80486 rounds to the dest precision, then
440 converts to decide underflow. */
441 if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) &&
442 (st0_ptr->sigl & 0x000007ff)) )
443#endif /* PECULIAR_486 */
444 {
445 EXCEPTION(EX_Underflow);
446 /* This is a special case: see sec 16.2.5.1 of
447 the 80486 book */
448 if ( !(control_word & CW_Underflow) )
449 return 0;
450 }
451 EXCEPTION(precision_loss);
452 if ( !(control_word & CW_Precision) )
453 return 0;
454 }
455 l[0] = tmp.sigl;
456 l[1] = tmp.sigh;
457 }
458 else
459 {
460 if ( tmp.sigl & 0x000007ff )
461 {
462 precision_loss = 1;
463 switch (control_word & CW_RC)
464 {
465 case RC_RND:
466 /* Rounding can get a little messy.. */
467 increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
468 ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
469 break;
470 case RC_DOWN: /* towards -infinity */
471 increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff;
472 break;
473 case RC_UP: /* towards +infinity */
474 increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0;
475 break;
476 case RC_CHOP:
477 increment = 0;
478 break;
479 }
480
481 /* Truncate the mantissa */
482 tmp.sigl &= 0xfffff800;
483
484 if ( increment )
485 {
486 if ( tmp.sigl >= 0xfffff800 )
487 {
488 /* the sigl part overflows */
489 if ( tmp.sigh == 0xffffffff )
490 {
491 /* The sigh part overflows */
492 tmp.sigh = 0x80000000;
493 exp++;
494 if (exp >= EXP_OVER)
495 goto overflow;
496 }
497 else
498 {
499 tmp.sigh ++;
500 }
501 tmp.sigl = 0x00000000;
502 }
503 else
504 {
505 /* We only need to increment sigl */
506 tmp.sigl += 0x00000800;
507 }
508 }
509 }
510 else
511 precision_loss = 0;
512
513 l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
514 l[1] = ((tmp.sigh >> 11) & 0xfffff);
515
516 if ( exp > DOUBLE_Emax )
517 {
518 overflow:
519 EXCEPTION(EX_Overflow);
520 if ( !(control_word & CW_Overflow) )
521 return 0;
522 set_precision_flag_up();
523 if ( !(control_word & CW_Precision) )
524 return 0;
525
526 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
527 /* Overflow to infinity */
528 l[0] = 0x00000000; /* Set to */
529 l[1] = 0x7ff00000; /* + INF */
530 }
531 else
532 {
533 if ( precision_loss )
534 {
535 if ( increment )
536 set_precision_flag_up();
537 else
538 set_precision_flag_down();
539 }
540 /* Add the exponent */
541 l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20);
542 }
543 }
544 }
545 else if (st0_tag == TAG_Zero)
546 {
547 /* Number is zero */
548 l[0] = 0;
549 l[1] = 0;
550 }
551 else if ( st0_tag == TAG_Special )
552 {
553 st0_tag = FPU_Special(st0_ptr);
554 if ( st0_tag == TW_Denormal )
555 {
556 /* A denormal will always underflow. */
557#ifndef PECULIAR_486
558 /* An 80486 is supposed to be able to generate
559 a denormal exception here, but... */
560 /* Underflow has priority. */
561 if ( control_word & CW_Underflow )
562 denormal_operand();
563#endif /* PECULIAR_486 */
564 reg_copy(st0_ptr, &tmp);
565 goto denormal_arg;
566 }
567 else if (st0_tag == TW_Infinity)
568 {
569 l[0] = 0;
570 l[1] = 0x7ff00000;
571 }
572 else if (st0_tag == TW_NaN)
573 {
574 /* Is it really a NaN ? */
575 if ( (exponent(st0_ptr) == EXP_OVER)
576 && (st0_ptr->sigh & 0x80000000) )
577 {
578 /* See if we can get a valid NaN from the FPU_REG */
579 l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21);
580 l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
581 if ( !(st0_ptr->sigh & 0x40000000) )
582 {
583 /* It is a signalling NaN */
584 EXCEPTION(EX_Invalid);
585 if ( !(control_word & CW_Invalid) )
586 return 0;
587 l[1] |= (0x40000000 >> 11);
588 }
589 l[1] |= 0x7ff00000;
590 }
591 else
592 {
593 /* It is an unsupported data type */
594 EXCEPTION(EX_Invalid);
595 if ( !(control_word & CW_Invalid) )
596 return 0;
597 l[0] = 0;
598 l[1] = 0xfff80000;
599 }
600 }
601 }
602 else if ( st0_tag == TAG_Empty )
603 {
604 /* Empty register (stack underflow) */
605 EXCEPTION(EX_StackUnder);
606 if ( control_word & CW_Invalid )
607 {
608 /* The masked response */
609 /* Put out the QNaN indefinite */
610 RE_ENTRANT_CHECK_OFF;
611 FPU_access_ok(VERIFY_WRITE,dfloat,8);
612 FPU_put_user(0, (unsigned long __user *) dfloat);
613 FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat);
614 RE_ENTRANT_CHECK_ON;
615 return 1;
616 }
617 else
618 return 0;
619 }
620 if ( getsign(st0_ptr) )
621 l[1] |= 0x80000000;
622
623 RE_ENTRANT_CHECK_OFF;
624 FPU_access_ok(VERIFY_WRITE,dfloat,8);
625 FPU_put_user(l[0], (unsigned long __user *)dfloat);
626 FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
627 RE_ENTRANT_CHECK_ON;
628
629 return 1;
630}
631
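FPU_store_double above builds the two 32-bit halves of the result and rounds the eleven discarded significand bits according to the RC field of the control word. For reference, this stand-alone sketch (not part of the patch) performs the inverse of the unpacking shown earlier, packing sign, exponent and a 64-bit significand into a double by simple truncation; pack_double is an illustrative name and no rounding or special-case handling is attempted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double pack_double(int sign, int exp, uint64_t sig)
{
    uint64_t bits;
    double d;

    bits  = (uint64_t)(sign & 1) << 63;               /* sign bit */
    bits |= (uint64_t)((exp + 1023) & 0x7ff) << 52;   /* biased exponent */
    bits |= (sig >> 11) & 0x000fffffffffffffULL;      /* drop the implicit bit, truncate */
    memcpy(&d, &bits, sizeof(d));
    return d;
}

int main(void)
{
    /* 1.625 * 2^2 with a negative sign should give -6.5 */
    printf("%g\n", pack_double(1, 2, 0xd000000000000000ULL));
    return 0;
}
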
632
633/* Put a float into user memory */
634int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
635{
636 long templ = 0;
637 unsigned long increment = 0; /* avoid gcc warnings */
638 int precision_loss;
639 int exp;
640 FPU_REG tmp;
641
642 if ( st0_tag == TAG_Valid )
643 {
644
645 reg_copy(st0_ptr, &tmp);
646 exp = exponent(&tmp);
647
648 if ( exp < SINGLE_Emin )
649 {
650 addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
651
652 denormal_arg:
653
654 if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
655 {
656#ifdef PECULIAR_486
657 /* Did it round to a non-denormal ? */
658 /* This behaviour might be regarded as peculiar; it appears
659 that the 80486 rounds to the dest precision, then
660 converts to decide underflow. */
661 if ( !((tmp.sigl == 0x00800000) &&
662 ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) )
663#endif /* PECULIAR_486 */
664 {
665 EXCEPTION(EX_Underflow);
666 /* This is a special case: see sec 16.2.5.1 of
667 the 80486 book */
668 if ( !(control_word & CW_Underflow) )
669 return 0;
670 }
671 EXCEPTION(precision_loss);
672 if ( !(control_word & CW_Precision) )
673 return 0;
674 }
675 templ = tmp.sigl;
676 }
677 else
678 {
679 if ( tmp.sigl | (tmp.sigh & 0x000000ff) )
680 {
681 unsigned long sigh = tmp.sigh;
682 unsigned long sigl = tmp.sigl;
683
684 precision_loss = 1;
685 switch (control_word & CW_RC)
686 {
687 case RC_RND:
688 increment = ((sigh & 0xff) > 0x80) /* more than half */
689 || (((sigh & 0xff) == 0x80) && sigl) /* more than half */
690 || ((sigh & 0x180) == 0x180); /* round to even */
691 break;
692 case RC_DOWN: /* towards -infinity */
693 increment = signpositive(&tmp)
694 ? 0 : (sigl | (sigh & 0xff));
695 break;
696 case RC_UP: /* towards +infinity */
697 increment = signpositive(&tmp)
698 ? (sigl | (sigh & 0xff)) : 0;
699 break;
700 case RC_CHOP:
701 increment = 0;
702 break;
703 }
704
705 /* Truncate part of the mantissa */
706 tmp.sigl = 0;
707
708 if (increment)
709 {
710 if ( sigh >= 0xffffff00 )
711 {
712 /* The sigh part overflows */
713 tmp.sigh = 0x80000000;
714 exp++;
715 if ( exp >= EXP_OVER )
716 goto overflow;
717 }
718 else
719 {
720 tmp.sigh &= 0xffffff00;
721 tmp.sigh += 0x100;
722 }
723 }
724 else
725 {
726 tmp.sigh &= 0xffffff00; /* Finish the truncation */
727 }
728 }
729 else
730 precision_loss = 0;
731
732 templ = (tmp.sigh >> 8) & 0x007fffff;
733
734 if ( exp > SINGLE_Emax )
735 {
736 overflow:
737 EXCEPTION(EX_Overflow);
738 if ( !(control_word & CW_Overflow) )
739 return 0;
740 set_precision_flag_up();
741 if ( !(control_word & CW_Precision) )
742 return 0;
743
744 /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
745 /* Masked response is overflow to infinity. */
746 templ = 0x7f800000;
747 }
748 else
749 {
750 if ( precision_loss )
751 {
752 if ( increment )
753 set_precision_flag_up();
754 else
755 set_precision_flag_down();
756 }
757 /* Add the exponent */
758 templ |= ((exp+SINGLE_Ebias) & 0xff) << 23;
759 }
760 }
761 }
762 else if (st0_tag == TAG_Zero)
763 {
764 templ = 0;
765 }
766 else if ( st0_tag == TAG_Special )
767 {
768 st0_tag = FPU_Special(st0_ptr);
769 if (st0_tag == TW_Denormal)
770 {
771 reg_copy(st0_ptr, &tmp);
772
773 /* A denormal will always underflow. */
774#ifndef PECULIAR_486
775 /* An 80486 is supposed to be able to generate
776 a denormal exception here, but... */
777 /* Underflow has priority. */
778 if ( control_word & CW_Underflow )
779 denormal_operand();
780#endif /* PECULIAR_486 */
781 goto denormal_arg;
782 }
783 else if (st0_tag == TW_Infinity)
784 {
785 templ = 0x7f800000;
786 }
787 else if (st0_tag == TW_NaN)
788 {
789 /* Is it really a NaN ? */
790 if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) )
791 {
792 /* See if we can get a valid NaN from the FPU_REG */
793 templ = st0_ptr->sigh >> 8;
794 if ( !(st0_ptr->sigh & 0x40000000) )
795 {
796 /* It is a signalling NaN */
797 EXCEPTION(EX_Invalid);
798 if ( !(control_word & CW_Invalid) )
799 return 0;
800 templ |= (0x40000000 >> 8);
801 }
802 templ |= 0x7f800000;
803 }
804 else
805 {
806 /* It is an unsupported data type */
807 EXCEPTION(EX_Invalid);
808 if ( !(control_word & CW_Invalid) )
809 return 0;
810 templ = 0xffc00000;
811 }
812 }
813#ifdef PARANOID
814 else
815 {
816 EXCEPTION(EX_INTERNAL|0x164);
817 return 0;
818 }
819#endif
820 }
821 else if ( st0_tag == TAG_Empty )
822 {
823 /* Empty register (stack underflow) */
824 EXCEPTION(EX_StackUnder);
825 if ( control_word & EX_Invalid )
826 {
827 /* The masked response */
828 /* Put out the QNaN indefinite */
829 RE_ENTRANT_CHECK_OFF;
830 FPU_access_ok(VERIFY_WRITE,single,4);
831 FPU_put_user(0xffc00000, (unsigned long __user *) single);
832 RE_ENTRANT_CHECK_ON;
833 return 1;
834 }
835 else
836 return 0;
837 }
838#ifdef PARANOID
839 else
840 {
841 EXCEPTION(EX_INTERNAL|0x163);
842 return 0;
843 }
844#endif
845 if ( getsign(st0_ptr) )
846 templ |= 0x80000000;
847
848 RE_ENTRANT_CHECK_OFF;
849 FPU_access_ok(VERIFY_WRITE,single,4);
850 FPU_put_user(templ,(unsigned long __user *) single);
851 RE_ENTRANT_CHECK_ON;
852
853 return 1;
854}
855
856
857/* Put a long long into user memory */
858int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
859{
860 FPU_REG t;
861 long long tll;
862 int precision_loss;
863
864 if ( st0_tag == TAG_Empty )
865 {
866 /* Empty register (stack underflow) */
867 EXCEPTION(EX_StackUnder);
868 goto invalid_operand;
869 }
870 else if ( st0_tag == TAG_Special )
871 {
872 st0_tag = FPU_Special(st0_ptr);
873 if ( (st0_tag == TW_Infinity) ||
874 (st0_tag == TW_NaN) )
875 {
876 EXCEPTION(EX_Invalid);
877 goto invalid_operand;
878 }
879 }
880
881 reg_copy(st0_ptr, &t);
882 precision_loss = FPU_round_to_int(&t, st0_tag);
883 ((long *)&tll)[0] = t.sigl;
884 ((long *)&tll)[1] = t.sigh;
885 if ( (precision_loss == 1) ||
886 ((t.sigh & 0x80000000) &&
887 !((t.sigh == 0x80000000) && (t.sigl == 0) &&
888 signnegative(&t))) )
889 {
890 EXCEPTION(EX_Invalid);
891 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
892 invalid_operand:
893 if ( control_word & EX_Invalid )
894 {
895 /* Produce something like QNaN "indefinite" */
896 tll = 0x8000000000000000LL;
897 }
898 else
899 return 0;
900 }
901 else
902 {
903 if ( precision_loss )
904 set_precision_flag(precision_loss);
905 if ( signnegative(&t) )
906 tll = - tll;
907 }
908
909 RE_ENTRANT_CHECK_OFF;
910 FPU_access_ok(VERIFY_WRITE,d,8);
911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
913 RE_ENTRANT_CHECK_ON;
914
915 return 1;
916}
917
918
919/* Put a long into user memory */
920int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
921{
922 FPU_REG t;
923 int precision_loss;
924
925 if ( st0_tag == TAG_Empty )
926 {
927 /* Empty register (stack underflow) */
928 EXCEPTION(EX_StackUnder);
929 goto invalid_operand;
930 }
931 else if ( st0_tag == TAG_Special )
932 {
933 st0_tag = FPU_Special(st0_ptr);
934 if ( (st0_tag == TW_Infinity) ||
935 (st0_tag == TW_NaN) )
936 {
937 EXCEPTION(EX_Invalid);
938 goto invalid_operand;
939 }
940 }
941
942 reg_copy(st0_ptr, &t);
943 precision_loss = FPU_round_to_int(&t, st0_tag);
944 if (t.sigh ||
945 ((t.sigl & 0x80000000) &&
946 !((t.sigl == 0x80000000) && signnegative(&t))) )
947 {
948 EXCEPTION(EX_Invalid);
949 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
950 invalid_operand:
951 if ( control_word & EX_Invalid )
952 {
953 /* Produce something like QNaN "indefinite" */
954 t.sigl = 0x80000000;
955 }
956 else
957 return 0;
958 }
959 else
960 {
961 if ( precision_loss )
962 set_precision_flag(precision_loss);
963 if ( signnegative(&t) )
964 t.sigl = -(long)t.sigl;
965 }
966
967 RE_ENTRANT_CHECK_OFF;
968 FPU_access_ok(VERIFY_WRITE,d,4);
969 FPU_put_user(t.sigl, (unsigned long __user *) d);
970 RE_ENTRANT_CHECK_ON;
971
972 return 1;
973}
974
975
976/* Put a short into user memory */
977int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
978{
979 FPU_REG t;
980 int precision_loss;
981
982 if ( st0_tag == TAG_Empty )
983 {
984 /* Empty register (stack underflow) */
985 EXCEPTION(EX_StackUnder);
986 goto invalid_operand;
987 }
988 else if ( st0_tag == TAG_Special )
989 {
990 st0_tag = FPU_Special(st0_ptr);
991 if ( (st0_tag == TW_Infinity) ||
992 (st0_tag == TW_NaN) )
993 {
994 EXCEPTION(EX_Invalid);
995 goto invalid_operand;
996 }
997 }
998
999 reg_copy(st0_ptr, &t);
1000 precision_loss = FPU_round_to_int(&t, st0_tag);
1001 if (t.sigh ||
1002 ((t.sigl & 0xffff8000) &&
1003 !((t.sigl == 0x8000) && signnegative(&t))) )
1004 {
1005 EXCEPTION(EX_Invalid);
1006 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1007 invalid_operand:
1008 if ( control_word & EX_Invalid )
1009 {
1010 /* Produce something like QNaN "indefinite" */
1011 t.sigl = 0x8000;
1012 }
1013 else
1014 return 0;
1015 }
1016 else
1017 {
1018 if ( precision_loss )
1019 set_precision_flag(precision_loss);
1020 if ( signnegative(&t) )
1021 t.sigl = -t.sigl;
1022 }
1023
1024 RE_ENTRANT_CHECK_OFF;
1025 FPU_access_ok(VERIFY_WRITE,d,2);
1026 FPU_put_user((short)t.sigl, d);
1027 RE_ENTRANT_CHECK_ON;
1028
1029 return 1;
1030}
1031
1032
1033/* Put a packed bcd array into user memory */
1034int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
1035{
1036 FPU_REG t;
1037 unsigned long long ll;
1038 u_char b;
1039 int i, precision_loss;
1040 u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
1041
1042 if ( st0_tag == TAG_Empty )
1043 {
1044 /* Empty register (stack underflow) */
1045 EXCEPTION(EX_StackUnder);
1046 goto invalid_operand;
1047 }
1048 else if ( st0_tag == TAG_Special )
1049 {
1050 st0_tag = FPU_Special(st0_ptr);
1051 if ( (st0_tag == TW_Infinity) ||
1052 (st0_tag == TW_NaN) )
1053 {
1054 EXCEPTION(EX_Invalid);
1055 goto invalid_operand;
1056 }
1057 }
1058
1059 reg_copy(st0_ptr, &t);
1060 precision_loss = FPU_round_to_int(&t, st0_tag);
1061 ll = significand(&t);
1062
1063 /* Check for overflow, by comparing with 999999999999999999 decimal. */
1064 if ( (t.sigh > 0x0de0b6b3) ||
1065 ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) )
1066 {
1067 EXCEPTION(EX_Invalid);
1068 /* This is a special case: see sec 16.2.5.1 of the 80486 book */
1069 invalid_operand:
1070 if ( control_word & CW_Invalid )
1071 {
1072 /* Produce the QNaN "indefinite" */
1073 RE_ENTRANT_CHECK_OFF;
1074 FPU_access_ok(VERIFY_WRITE,d,10);
1075 for ( i = 0; i < 7; i++)
1076 FPU_put_user(0, d+i); /* These bytes "undefined" */
1077 FPU_put_user(0xc0, d+7); /* This byte "undefined" */
1078 FPU_put_user(0xff, d+8);
1079 FPU_put_user(0xff, d+9);
1080 RE_ENTRANT_CHECK_ON;
1081 return 1;
1082 }
1083 else
1084 return 0;
1085 }
1086 else if ( precision_loss )
1087 {
1088 /* Precision loss doesn't stop the data transfer */
1089 set_precision_flag(precision_loss);
1090 }
1091
1092 RE_ENTRANT_CHECK_OFF;
1093 FPU_access_ok(VERIFY_WRITE,d,10);
1094 RE_ENTRANT_CHECK_ON;
1095 for ( i = 0; i < 9; i++)
1096 {
1097 b = FPU_div_small(&ll, 10);
1098 b |= (FPU_div_small(&ll, 10)) << 4;
1099 RE_ENTRANT_CHECK_OFF;
1100 FPU_put_user(b, d+i);
1101 RE_ENTRANT_CHECK_ON;
1102 }
1103 RE_ENTRANT_CHECK_OFF;
1104 FPU_put_user(sign, d+9);
1105 RE_ENTRANT_CHECK_ON;
1106
1107 return 1;
1108}
1109
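FPU_store_bcd above emits two decimal digits per byte using FPU_div_small. The sketch below (not part of the patch) produces the same ten-byte image with ordinary division by ten; ll_to_bcd is an illustrative name and the magnitude is assumed to fit in eighteen digits.

#include <stdio.h>

static void ll_to_bcd(long long val, unsigned char b[10])
{
    unsigned long long u = (val < 0) ? -(unsigned long long)val : (unsigned long long)val;
    unsigned char lo, hi;
    int i;

    for (i = 0; i < 9; i++) {
        lo = u % 10; u /= 10;                   /* low nibble digit */
        hi = u % 10; u /= 10;                   /* high nibble digit */
        b[i] = (unsigned char)((hi << 4) | lo);
    }
    b[9] = (val < 0) ? 0x80 : 0x00;             /* sign byte */
}

int main(void)
{
    unsigned char b[10];
    int i;

    ll_to_bcd(-1234, b);
    for (i = 9; i >= 0; i--)                    /* print most significant byte first */
        printf("%02x ", b[i]);
    printf("\n");
    return 0;
}
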
1110/*===========================================================================*/
1111
1112/* r gets mangled such that its significand holds the integer value and
1113 its sign is kept: it is NOT normalized */
1114/* The return value (in eax) is zero if the result is exact;
1115 if bits are changed due to rounding, truncation, etc, then
1116 a non-zero value is returned */
1117/* Overflow is signalled by a non-zero return value (in eax).
1118 In the case of overflow, the returned significand always has the
1119 largest possible value */
1120int FPU_round_to_int(FPU_REG *r, u_char tag)
1121{
1122 u_char very_big;
1123 unsigned eax;
1124
1125 if (tag == TAG_Zero)
1126 {
1127 /* Make sure that zero is returned */
1128 significand(r) = 0;
1129 return 0; /* o.k. */
1130 }
1131
1132 if (exponent(r) > 63)
1133 {
1134 r->sigl = r->sigh = ~0; /* The largest representable number */
1135 return 1; /* overflow */
1136 }
1137
1138 eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
1139 very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
1140#define half_or_more (eax & 0x80000000)
1141#define frac_part (eax)
1142#define more_than_half ((eax & 0x80000001) == 0x80000001)
1143 switch (control_word & CW_RC)
1144 {
1145 case RC_RND:
1146 if ( more_than_half /* nearest */
1147 || (half_or_more && (r->sigl & 1)) ) /* odd -> even */
1148 {
1149 if ( very_big ) return 1; /* overflow */
1150 significand(r) ++;
1151 return PRECISION_LOST_UP;
1152 }
1153 break;
1154 case RC_DOWN:
1155 if (frac_part && getsign(r))
1156 {
1157 if ( very_big ) return 1; /* overflow */
1158 significand(r) ++;
1159 return PRECISION_LOST_UP;
1160 }
1161 break;
1162 case RC_UP:
1163 if (frac_part && !getsign(r))
1164 {
1165 if ( very_big ) return 1; /* overflow */
1166 significand(r) ++;
1167 return PRECISION_LOST_UP;
1168 }
1169 break;
1170 case RC_CHOP:
1171 break;
1172 }
1173
1174 return eax ? PRECISION_LOST_DOWN : 0;
1175
1176}
1177
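FPU_round_to_int above decides whether to bump the magnitude from the shifted-out fraction and the rounding-control bits. The stand-alone sketch below (not part of the patch) expresses the same decision table in C for a magnitude given as an integer part plus a 32-bit binary fraction; the enum and function names are illustrative and the precision-lost bookkeeping is omitted.

#include <stdint.h>
#include <stdio.h>

enum rc { RC_RND, RC_DOWN, RC_UP, RC_CHOP };

static uint64_t round_mag(uint64_t ipart, uint32_t frac, int negative, enum rc mode)
{
    int increment = 0;

    switch (mode) {
    case RC_RND:   /* nearest, ties to even */
        increment = frac > 0x80000000u ||
                    (frac == 0x80000000u && (ipart & 1));
        break;
    case RC_DOWN:  /* toward -infinity: bump the magnitude only if negative */
        increment = frac && negative;
        break;
    case RC_UP:    /* toward +infinity: bump the magnitude only if positive */
        increment = frac && !negative;
        break;
    case RC_CHOP:  /* truncate */
        increment = 0;
        break;
    }
    return ipart + increment;
}

int main(void)
{
    /* 2.5 rounds to 2 (tie to even); -2.5 toward -infinity rounds to magnitude 3 */
    printf("%llu %llu\n",
           (unsigned long long)round_mag(2, 0x80000000u, 0, RC_RND),
           (unsigned long long)round_mag(2, 0x80000000u, 1, RC_DOWN));
    return 0;
}
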
1178/*===========================================================================*/
1179
1180u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
1181{
1182 unsigned short tag_word = 0;
1183 u_char tag;
1184 int i;
1185
1186 if ( (addr_modes.default_mode == VM86) ||
1187 ((addr_modes.default_mode == PM16)
1188 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
1189 {
1190 RE_ENTRANT_CHECK_OFF;
1191 FPU_access_ok(VERIFY_READ, s, 0x0e);
1192 FPU_get_user(control_word, (unsigned short __user *) s);
1193 FPU_get_user(partial_status, (unsigned short __user *) (s+2));
1194 FPU_get_user(tag_word, (unsigned short __user *) (s+4));
1195 FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6));
1196 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8));
1197 FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a));
1198 FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c));
1199 RE_ENTRANT_CHECK_ON;
1200 s += 0x0e;
1201 if ( addr_modes.default_mode == VM86 )
1202 {
1203 instruction_address.offset
1204 += (instruction_address.selector & 0xf000) << 4;
1205 operand_address.offset += (operand_address.selector & 0xf000) << 4;
1206 }
1207 }
1208 else
1209 {
1210 RE_ENTRANT_CHECK_OFF;
1211 FPU_access_ok(VERIFY_READ, s, 0x1c);
1212 FPU_get_user(control_word, (unsigned short __user *) s);
1213 FPU_get_user(partial_status, (unsigned short __user *) (s+4));
1214 FPU_get_user(tag_word, (unsigned short __user *) (s+8));
1215 FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c));
1216 FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10));
1217 FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12));
1218 FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14));
1219 FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18));
1220 RE_ENTRANT_CHECK_ON;
1221 s += 0x1c;
1222 }
1223
1224#ifdef PECULIAR_486
1225 control_word &= ~0xe080;
1226#endif /* PECULIAR_486 */
1227
1228 top = (partial_status >> SW_Top_Shift) & 7;
1229
1230 if ( partial_status & ~control_word & CW_Exceptions )
1231 partial_status |= (SW_Summary | SW_Backward);
1232 else
1233 partial_status &= ~(SW_Summary | SW_Backward);
1234
1235 for ( i = 0; i < 8; i++ )
1236 {
1237 tag = tag_word & 3;
1238 tag_word >>= 2;
1239
1240 if ( tag == TAG_Empty )
1241 /* New tag is empty. Accept it */
1242 FPU_settag(i, TAG_Empty);
1243 else if ( FPU_gettag(i) == TAG_Empty )
1244 {
1245 /* Old tag is empty and new tag is not empty. New tag is determined
1246 by old reg contents */
1247 if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias )
1248 {
1249 if ( !(fpu_register(i).sigl | fpu_register(i).sigh) )
1250 FPU_settag(i, TAG_Zero);
1251 else
1252 FPU_settag(i, TAG_Special);
1253 }
1254 else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias )
1255 {
1256 FPU_settag(i, TAG_Special);
1257 }
1258 else if ( fpu_register(i).sigh & 0x80000000 )
1259 FPU_settag(i, TAG_Valid);
1260 else
1261 FPU_settag(i, TAG_Special); /* An Un-normal */
1262 }
1263 /* Else old tag is not empty and new tag is not empty. Old tag
1264 remains correct */
1265 }
1266
1267 return s;
1268}
1269
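fldenv above parses the 28-byte 32-bit protected-mode environment image byte offset by byte offset. The sketch below (not part of the patch) writes the same layout as a C struct so the offsets can be checked at a glance; the struct and field names are illustrative, not kernel identifiers.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct fpu_env32 {
    uint16_t control_word;   uint16_t pad0;    /* offset 0x00 */
    uint16_t status_word;    uint16_t pad1;    /* offset 0x04 */
    uint16_t tag_word;       uint16_t pad2;    /* offset 0x08 */
    uint32_t ip_offset;                        /* offset 0x0c */
    uint16_t ip_selector;                      /* offset 0x10 */
    uint16_t opcode;                           /* offset 0x12 */
    uint32_t operand_offset;                   /* offset 0x14 */
    uint32_t operand_selector;                 /* offset 0x18 */
};

int main(void)
{
    printf("size=%zu ip_offset=%zu opcode=%zu operand_selector=%zu\n",
           sizeof(struct fpu_env32),
           offsetof(struct fpu_env32, ip_offset),
           offsetof(struct fpu_env32, opcode),
           offsetof(struct fpu_env32, operand_selector));
    return 0;
}
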
1270
1271void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
1272{
1273 int i, regnr;
1274 u_char __user *s = fldenv(addr_modes, data_address);
1275 int offset = (top & 7) * 10, other = 80 - offset;
1276
1277 /* Copy all registers in stack order. */
1278 RE_ENTRANT_CHECK_OFF;
1279 FPU_access_ok(VERIFY_READ,s,80);
1280 __copy_from_user(register_base+offset, s, other);
1281 if ( offset )
1282 __copy_from_user(register_base, s+other, offset);
1283 RE_ENTRANT_CHECK_ON;
1284
1285 for ( i = 0; i < 8; i++ )
1286 {
1287 regnr = (i+top) & 7;
1288 if ( FPU_gettag(regnr) != TAG_Empty )
1289 /* The loaded data over-rides all other cases. */
1290 FPU_settag(regnr, FPU_tagof(&st(i)));
1291 }
1292
1293}
1294
1295
1296u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1297{
1298 if ( (addr_modes.default_mode == VM86) ||
1299 ((addr_modes.default_mode == PM16)
1300 ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
1301 {
1302 RE_ENTRANT_CHECK_OFF;
1303 FPU_access_ok(VERIFY_WRITE,d,14);
1304#ifdef PECULIAR_486
1305 FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d);
1306#else
1307 FPU_put_user(control_word, (unsigned short __user *) d);
1308#endif /* PECULIAR_486 */
1309 FPU_put_user(status_word(), (unsigned short __user *) (d+2));
1310 FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4));
1311 FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6));
1312 FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a));
1313 if ( addr_modes.default_mode == VM86 )
1314 {
1315 FPU_put_user((instruction_address.offset & 0xf0000) >> 4,
1316 (unsigned short __user *) (d+8));
1317 FPU_put_user((operand_address.offset & 0xf0000) >> 4,
1318 (unsigned short __user *) (d+0x0c));
1319 }
1320 else
1321 {
1322 FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8));
1323 FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c));
1324 }
1325 RE_ENTRANT_CHECK_ON;
1326 d += 0x0e;
1327 }
1328 else
1329 {
1330 RE_ENTRANT_CHECK_OFF;
1331 FPU_access_ok(VERIFY_WRITE, d, 7*4);
1332#ifdef PECULIAR_486
1333 control_word &= ~0xe080;
1334 /* An 80486 sets nearly all of the reserved bits to 1. */
1335 control_word |= 0xffff0040;
1336 partial_status = status_word() | 0xffff0000;
1337 fpu_tag_word |= 0xffff0000;
1338 I387.soft.fcs &= ~0xf8000000;
1339 I387.soft.fos |= 0xffff0000;
1340#endif /* PECULIAR_486 */
1341 if (__copy_to_user(d, &control_word, 7*4))
1342 FPU_abort;
1343 RE_ENTRANT_CHECK_ON;
1344 d += 0x1c;
1345 }
1346
1347 control_word |= CW_Exceptions;
1348 partial_status &= ~(SW_Summary | SW_Backward);
1349
1350 return d;
1351}
1352
1353
1354void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1355{
1356 u_char __user *d;
1357 int offset = (top & 7) * 10, other = 80 - offset;
1358
1359 d = fstenv(addr_modes, data_address);
1360
1361 RE_ENTRANT_CHECK_OFF;
1362 FPU_access_ok(VERIFY_WRITE,d,80);
1363
1364 /* Copy all registers in stack order. */
1365 if (__copy_to_user(d, register_base+offset, other))
1366 FPU_abort;
1367 if ( offset )
1368 if (__copy_to_user(d+other, register_base, offset))
1369 FPU_abort;
1370 RE_ENTRANT_CHECK_ON;
1371
1372 finit();
1373}
1374
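frstor and fsave above copy the 80-byte register area in two pieces because the image holds st(0)..st(7) in stack order, starting at 'top', while the emulator keeps the registers in physical order. A stand-alone sketch of that two-part copy (not part of the patch), with illustrative names:

#include <stdio.h>
#include <string.h>

#define REG_BYTES 10

static void save_stack_order(const unsigned char phys[8][REG_BYTES], int top,
                             unsigned char image[8 * REG_BYTES])
{
    int offset = (top & 7) * REG_BYTES;         /* where st(0) lives in phys[] */
    int other  = 8 * REG_BYTES - offset;

    memcpy(image, (const unsigned char *)phys + offset, other);
    if (offset)
        memcpy(image + other, phys, offset);
}

int main(void)
{
    unsigned char phys[8][REG_BYTES], image[8 * REG_BYTES];
    int i;

    for (i = 0; i < 8; i++)
        memset(phys[i], i, REG_BYTES);          /* tag each register with its index */

    save_stack_order(phys, 3, image);
    printf("st(0) came from physical register %d\n", image[0]);   /* prints 3 */
    return 0;
}
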
1375/*===========================================================================*/
diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c
new file mode 100644
index 000000000000..40f50b61bc67
--- /dev/null
+++ b/arch/x86/math-emu/reg_mul.c
@@ -0,0 +1,132 @@
1/*---------------------------------------------------------------------------+
2 | reg_mul.c |
3 | |
4 | Multiply one FPU_REG by another, put the result in a destination FPU_REG. |
5 | |
6 | Copyright (C) 1992,1993,1997 |
7 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
8 | E-mail billm@suburbia.net |
9 | |
10 | Returns the tag of the result if no exceptions or errors occurred. |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | The destination may be any FPU_REG, including one of the source FPU_REGs. |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19#include "exception.h"
20#include "reg_constant.h"
21#include "fpu_system.h"
22
23
24/*
25 Multiply two registers to give a register result.
26 The sources are st(deststnr) and (b,tagb,signb).
27 The destination is st(deststnr).
28 */
29/* This routine must be called with non-empty source registers */
30int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
31{
32 FPU_REG *a = &st(deststnr);
33 FPU_REG *dest = a;
34 u_char taga = FPU_gettagi(deststnr);
35 u_char saved_sign = getsign(dest);
36 u_char sign = (getsign(a) ^ getsign(b));
37 int tag;
38
39
40 if ( !(taga | tagb) )
41 {
42 /* Both regs Valid, this should be the most common case. */
43
44 tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b));
45 if ( tag < 0 )
46 {
47 setsign(dest, saved_sign);
48 return tag;
49 }
50 FPU_settagi(deststnr, tag);
51 return tag;
52 }
53
54 if ( taga == TAG_Special )
55 taga = FPU_Special(a);
56 if ( tagb == TAG_Special )
57 tagb = FPU_Special(b);
58
59 if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
60 || ((taga == TW_Denormal) && (tagb == TAG_Valid))
61 || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
62 {
63 FPU_REG x, y;
64 if ( denormal_operand() < 0 )
65 return FPU_Exception;
66
67 FPU_to_exp16(a, &x);
68 FPU_to_exp16(b, &y);
69 tag = FPU_u_mul(&x, &y, dest, control_w, sign,
70 exponent16(&x) + exponent16(&y));
71 if ( tag < 0 )
72 {
73 setsign(dest, saved_sign);
74 return tag;
75 }
76 FPU_settagi(deststnr, tag);
77 return tag;
78 }
79 else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
80 {
81 if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
82 && (denormal_operand() < 0) )
83 return FPU_Exception;
84
85 /* Must have either both arguments == zero, or
86 one valid and the other zero.
87 The result is therefore zero. */
88 FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
89 /* The 80486 book says that the answer is +0, but a real
90 80486 gives the signed zero produced here.
91 IEEE-754 apparently says it should be this way. */
92 setsign(dest, sign);
93 return TAG_Zero;
94 }
95 /* Must have infinities, NaNs, etc */
96 else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
97 {
98 return real_2op_NaN(b, tagb, deststnr, &st(0));
99 }
100 else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
101 || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
102 {
103 return arith_invalid(deststnr); /* Zero*Infinity is invalid */
104 }
105 else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
106 && (denormal_operand() < 0) )
107 {
108 return FPU_Exception;
109 }
110 else if (taga == TW_Infinity)
111 {
112 FPU_copy_to_regi(a, TAG_Special, deststnr);
113 setsign(dest, sign);
114 return TAG_Special;
115 }
116 else if (tagb == TW_Infinity)
117 {
118 FPU_copy_to_regi(b, TAG_Special, deststnr);
119 setsign(dest, sign);
120 return TAG_Special;
121 }
122
123#ifdef PARANOID
124 else
125 {
126 EXCEPTION(EX_INTERNAL|0x102);
127 return FPU_Exception;
128 }
129#endif /* PARANOID */
130
131 return 0;
132}
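
FPU_mul above only sorts out tags, signs and special operands; the significand work is done by the FPU_u_mul helper added elsewhere in this series. As a rough picture of that step, here is a stand-alone sketch (not part of the patch) of an unsigned significand multiply: 128-bit product, exponents added, at most one renormalising shift. The names and the exponent convention (bit 63 of the significand carries weight 2^exp) are illustrative; unsigned __int128 is a gcc/clang extension.

#include <stdint.h>
#include <stdio.h>

static void u_mul(uint64_t a_sig, int a_exp, uint64_t b_sig, int b_exp,
                  uint64_t *r_sig, int *r_exp)
{
    unsigned __int128 prod = (unsigned __int128)a_sig * b_sig;

    *r_exp = a_exp + b_exp + 1;  /* bit 127 of the product has weight 2^(a_exp+b_exp+1) */
    if (!(prod >> 127)) {        /* the product of two [1,2) values lies in [1,4) */
        prod <<= 1;              /* renormalise: bring the ms bit back to bit 127 */
        (*r_exp)--;
    }
    *r_sig = (uint64_t)(prod >> 64);   /* keep the top 64 bits; lower bits feed rounding */
}

int main(void)
{
    uint64_t sig;
    int exp;

    /* 1.5 * 1.5 = 2.25, i.e. 1.125 * 2^1 */
    u_mul(0xc000000000000000ULL, 0, 0xc000000000000000ULL, 0, &sig, &exp);
    printf("sig=%016llx exp=%d\n", (unsigned long long)sig, exp);
    return 0;
}
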
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
new file mode 100644
index 000000000000..8b6352efceef
--- /dev/null
+++ b/arch/x86/math-emu/reg_norm.S
@@ -0,0 +1,147 @@
1/*---------------------------------------------------------------------------+
2 | reg_norm.S |
3 | |
4 | Copyright (C) 1992,1993,1994,1995,1997 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@suburbia.net |
7 | |
8 | Normalize the value in a FPU_REG. |
9 | |
10 | Call from C as: |
11 | int FPU_normalize(FPU_REG *n) |
12 | |
13 | int FPU_normalize_nuo(FPU_REG *n) |
14 | |
15 | Return value is the tag of the answer, or-ed with FPU_Exception if |
16 | one was raised, or -1 on internal error. |
17 | |
18 +---------------------------------------------------------------------------*/
19
20#include "fpu_emu.h"
21
22
23.text
24ENTRY(FPU_normalize)
25 pushl %ebp
26 movl %esp,%ebp
27 pushl %ebx
28
29 movl PARAM1,%ebx
30
31 movl SIGH(%ebx),%edx
32 movl SIGL(%ebx),%eax
33
34 orl %edx,%edx /* ms bits */
35 js L_done /* Already normalized */
36 jnz L_shift_1 /* Shift left 1 - 31 bits */
37
38 orl %eax,%eax
39 jz L_zero /* The contents are zero */
40
41 movl %eax,%edx
42 xorl %eax,%eax
43 subw $32,EXP(%ebx) /* This can cause an underflow */
44
45/* We need to shift left by 1 - 31 bits */
46L_shift_1:
47 bsrl %edx,%ecx /* get the required shift in %ecx */
48 subl $31,%ecx
49 negl %ecx
50 shld %cl,%eax,%edx
51 shl %cl,%eax
52 subw %cx,EXP(%ebx) /* This can cause an underflow */
53
54 movl %edx,SIGH(%ebx)
55 movl %eax,SIGL(%ebx)
56
57L_done:
58 cmpw EXP_OVER,EXP(%ebx)
59 jge L_overflow
60
61 cmpw EXP_UNDER,EXP(%ebx)
62 jle L_underflow
63
64L_exit_valid:
65 movl TAG_Valid,%eax
66
67 /* Convert the exponent to 80x87 form. */
68 addw EXTENDED_Ebias,EXP(%ebx)
69 andw $0x7fff,EXP(%ebx)
70
71L_exit:
72 popl %ebx
73 leave
74 ret
75
76
77L_zero:
78 movw $0,EXP(%ebx)
79 movl TAG_Zero,%eax
80 jmp L_exit
81
82L_underflow:
83 /* Convert the exponent to 80x87 form. */
84 addw EXTENDED_Ebias,EXP(%ebx)
85 push %ebx
86 call arith_underflow
87 pop %ebx
88 jmp L_exit
89
90L_overflow:
91 /* Convert the exponent to 80x87 form. */
92 addw EXTENDED_Ebias,EXP(%ebx)
93 push %ebx
94 call arith_overflow
95 pop %ebx
96 jmp L_exit
97
98
99
100/* Normalise without reporting underflow or overflow */
101ENTRY(FPU_normalize_nuo)
102 pushl %ebp
103 movl %esp,%ebp
104 pushl %ebx
105
106 movl PARAM1,%ebx
107
108 movl SIGH(%ebx),%edx
109 movl SIGL(%ebx),%eax
110
111 orl %edx,%edx /* ms bits */
112 js L_exit_nuo_valid /* Already normalized */
113 jnz L_nuo_shift_1 /* Shift left 1 - 31 bits */
114
115 orl %eax,%eax
116 jz L_exit_nuo_zero /* The contents are zero */
117
118 movl %eax,%edx
119 xorl %eax,%eax
120 subw $32,EXP(%ebx) /* This can cause an underflow */
121
122/* We need to shift left by 1 - 31 bits */
123L_nuo_shift_1:
124 bsrl %edx,%ecx /* get the required shift in %ecx */
125 subl $31,%ecx
126 negl %ecx
127 shld %cl,%eax,%edx
128 shl %cl,%eax
129 subw %cx,EXP(%ebx) /* This can cause an underflow */
130
131 movl %edx,SIGH(%ebx)
132 movl %eax,SIGL(%ebx)
133
134L_exit_nuo_valid:
135 movl TAG_Valid,%eax
136
137 popl %ebx
138 leave
139 ret
140
141L_exit_nuo_zero:
142 movl TAG_Zero,%eax
143 movw EXP_UNDER,EXP(%ebx)
144
145 popl %ebx
146 leave
147 ret
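
FPU_normalize above uses bsr and shld to shift the significand up in one step. Below is a plain-C rendering of the same operation (not part of the patch); the names and the exponent convention (the value is sig * 2^(exp - 63)) are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Returns 0 for a zero value, 1 for a normalized value. */
static int normalize(uint64_t *sig, int *exp)
{
    if (*sig == 0)
        return 0;

    while (!(*sig & 0x8000000000000000ULL)) {
        *sig <<= 1;      /* the assembly uses bsr + shld to do this in one step */
        (*exp)--;
    }
    return 1;
}

int main(void)
{
    uint64_t sig = 0x0000000000000005ULL;
    int exp = 0;

    /* value = sig * 2^(exp - 63); normalization preserves it */
    normalize(&sig, &exp);
    printf("sig=%016llx exp=%d\n", (unsigned long long)sig, exp);
    return 0;
}
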
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
new file mode 100644
index 000000000000..d1d4e48b4f67
--- /dev/null
+++ b/arch/x86/math-emu/reg_round.S
@@ -0,0 +1,708 @@
1 .file "reg_round.S"
2/*---------------------------------------------------------------------------+
3 | reg_round.S |
4 | |
5 | Rounding/truncation/etc for FPU basic arithmetic functions. |
6 | |
7 | Copyright (C) 1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | This code has four possible entry points. |
12 | The following must be entered by a jmp instruction: |
13 | fpu_reg_round, fpu_reg_round_sqrt, and fpu_Arith_exit. |
14 | |
15 | The FPU_round entry point is intended to be used by C code. |
16 | From C, call as: |
17 | int FPU_round(FPU_REG *arg, unsigned int extent, unsigned int control_w) |
18 | |
19 | Return value is the tag of the answer, or-ed with FPU_Exception if |
20 | one was raised, or -1 on internal error. |
21 | |
22 | For correct "up" and "down" rounding, the argument must have the correct |
23 | sign. |
24 | |
25 +---------------------------------------------------------------------------*/
26
27/*---------------------------------------------------------------------------+
28 | Four entry points. |
29 | |
30 | Needed by both the fpu_reg_round and fpu_reg_round_sqrt entry points: |
31 | %eax:%ebx 64 bit significand |
32 | %edx 32 bit extension of the significand |
33 | %edi pointer to an FPU_REG for the result to be stored |
34 | stack calling function must have set up a C stack frame and |
35 | pushed %esi, %edi, and %ebx |
36 | |
37 | Needed just for the fpu_reg_round_sqrt entry point: |
38 | %cx A control word in the same format as the FPU control word. |
39 | Otherwise, PARAM4 must give such a value. |
40 | |
41 | |
42 | The significand and its extension are assumed to be exact in the |
43 | following sense: |
44 | If the significand by itself is the exact result then the significand |
45 | extension (%edx) must contain 0, otherwise the significand extension |
46 | must be non-zero. |
47 | If the significand extension is non-zero then the significand is |
48 | smaller than the magnitude of the correct exact result by an amount |
49 | greater than zero and less than one ls bit of the significand. |
50 | The significand extension is only required to have three possible |
51 | non-zero values: |
52 | less than 0x80000000 <=> the significand is less than 1/2 an ls |
53 | bit smaller than the magnitude of the |
54 | true exact result. |
55 | exactly 0x80000000 <=> the significand is exactly 1/2 an ls bit |
56 | smaller than the magnitude of the true |
57 | exact result. |
58 | greater than 0x80000000 <=> the significand is more than 1/2 an ls |
59 | bit smaller than the magnitude of the |
60 | true exact result. |
61 | |
62 +---------------------------------------------------------------------------*/
63
64/*---------------------------------------------------------------------------+
65 | The code in this module has become quite complex, but it should handle |
66 | all of the FPU flags which are set at this stage of the basic arithmetic |
67 | computations. |
68 | There are a few rare cases where the results are not set identically to |
69 | a real FPU. These require a bit more thought because at this stage the |
70 | results of the code here appear to be more consistent... |
71 | This may be changed in a future version. |
72 +---------------------------------------------------------------------------*/
73
74
75#include "fpu_emu.h"
76#include "exception.h"
77#include "control_w.h"
78
79/* Flags for FPU_bits_lost */
80#define LOST_DOWN $1
81#define LOST_UP $2
82
83/* Flags for FPU_denormal */
84#define DENORMAL $1
85#define UNMASKED_UNDERFLOW $2
86
87
88#ifndef NON_REENTRANT_FPU
89/* Make the code re-entrant by putting
90 local storage on the stack: */
91#define FPU_bits_lost (%esp)
92#define FPU_denormal 1(%esp)
93
94#else
95/* Not re-entrant, so we can gain speed by putting
96 local storage in a static area: */
97.data
98 .align 4,0
99FPU_bits_lost:
100 .byte 0
101FPU_denormal:
102 .byte 0
103#endif /* NON_REENTRANT_FPU */
104
105
106.text
107.globl fpu_reg_round
108.globl fpu_Arith_exit
109
110/* Entry point when called from C */
111ENTRY(FPU_round)
112 pushl %ebp
113 movl %esp,%ebp
114 pushl %esi
115 pushl %edi
116 pushl %ebx
117
118 movl PARAM1,%edi
119 movl SIGH(%edi),%eax
120 movl SIGL(%edi),%ebx
121 movl PARAM2,%edx
122
123fpu_reg_round: /* Normal entry point */
124 movl PARAM4,%ecx
125
126#ifndef NON_REENTRANT_FPU
127 pushl %ebx /* adjust the stack pointer */
128#endif /* NON_REENTRANT_FPU */
129
130#ifdef PARANOID
131/* Cannot use this here yet */
132/* orl %eax,%eax */
133/* jns L_entry_bugged */
134#endif /* PARANOID */
135
136 cmpw EXP_UNDER,EXP(%edi)
137 jle L_Make_denorm /* The number is a de-normal */
138
139 movb $0,FPU_denormal /* 0 -> not a de-normal */
140
141Denorm_done:
142 movb $0,FPU_bits_lost /* No bits yet lost in rounding */
143
144 movl %ecx,%esi
145 andl CW_PC,%ecx
146 cmpl PR_64_BITS,%ecx
147 je LRound_To_64
148
149 cmpl PR_53_BITS,%ecx
150 je LRound_To_53
151
152 cmpl PR_24_BITS,%ecx
153 je LRound_To_24
154
155#ifdef PECULIAR_486
156/* With the precision control bits set to 01 "(reserved)", a real 80486
157 behaves as if the precision control bits were set to 11 "64 bits" */
158 cmpl PR_RESERVED_BITS,%ecx
159 je LRound_To_64
160#ifdef PARANOID
161 jmp L_bugged_denorm_486
162#endif /* PARANOID */
163#else
164#ifdef PARANOID
165 jmp L_bugged_denorm /* There is no bug, just a bad control word */
166#endif /* PARANOID */
167#endif /* PECULIAR_486 */
168
169
170/* Round etc to 24 bit precision */
171LRound_To_24:
172 movl %esi,%ecx
173 andl CW_RC,%ecx
174 cmpl RC_RND,%ecx
175 je LRound_nearest_24
176
177 cmpl RC_CHOP,%ecx
178 je LCheck_truncate_24
179
180 cmpl RC_UP,%ecx /* Towards +infinity */
181 je LUp_24
182
183 cmpl RC_DOWN,%ecx /* Towards -infinity */
184 je LDown_24
185
186#ifdef PARANOID
187 jmp L_bugged_round24
188#endif /* PARANOID */
189
190LUp_24:
191 cmpb SIGN_POS,PARAM5
192 jne LCheck_truncate_24 /* If negative then up==truncate */
193
194 jmp LCheck_24_round_up
195
196LDown_24:
197 cmpb SIGN_POS,PARAM5
198 je LCheck_truncate_24 /* If positive then down==truncate */
199
200LCheck_24_round_up:
201 movl %eax,%ecx
202 andl $0x000000ff,%ecx
203 orl %ebx,%ecx
204 orl %edx,%ecx
205 jnz LDo_24_round_up
206 jmp L_Re_normalise
207
208LRound_nearest_24:
209 /* Do rounding of the 24th bit if needed (nearest or even) */
210 movl %eax,%ecx
211 andl $0x000000ff,%ecx
212 cmpl $0x00000080,%ecx
213 jc LCheck_truncate_24 /* less than half, no increment needed */
214
215 jne LGreater_Half_24 /* greater than half, increment needed */
216
217 /* Possibly half, we need to check the ls bits */
218 orl %ebx,%ebx
219 jnz LGreater_Half_24 /* greater than half, increment needed */
220
221 orl %edx,%edx
222 jnz LGreater_Half_24 /* greater than half, increment needed */
223
224 /* Exactly half, increment only if 24th bit is 1 (round to even) */
225 testl $0x00000100,%eax
226 jz LDo_truncate_24
227
228LGreater_Half_24: /* Rounding: increment at the 24th bit */
229LDo_24_round_up:
230 andl $0xffffff00,%eax /* Truncate to 24 bits */
231 xorl %ebx,%ebx
232 movb LOST_UP,FPU_bits_lost
233 addl $0x00000100,%eax
234 jmp LCheck_Round_Overflow
235
236LCheck_truncate_24:
237 movl %eax,%ecx
238 andl $0x000000ff,%ecx
239 orl %ebx,%ecx
240 orl %edx,%ecx
241 jz L_Re_normalise /* No truncation needed */
242
243LDo_truncate_24:
244 andl $0xffffff00,%eax /* Truncate to 24 bits */
245 xorl %ebx,%ebx
246 movb LOST_DOWN,FPU_bits_lost
247 jmp L_Re_normalise
248
249
250/* Round etc to 53 bit precision */
251LRound_To_53:
252 movl %esi,%ecx
253 andl CW_RC,%ecx
254 cmpl RC_RND,%ecx
255 je LRound_nearest_53
256
257 cmpl RC_CHOP,%ecx
258 je LCheck_truncate_53
259
260 cmpl RC_UP,%ecx /* Towards +infinity */
261 je LUp_53
262
263 cmpl RC_DOWN,%ecx /* Towards -infinity */
264 je LDown_53
265
266#ifdef PARANOID
267 jmp L_bugged_round53
268#endif /* PARANOID */
269
270LUp_53:
271 cmpb SIGN_POS,PARAM5
272 jne LCheck_truncate_53 /* If negative then up==truncate */
273
274 jmp LCheck_53_round_up
275
276LDown_53:
277 cmpb SIGN_POS,PARAM5
278 je LCheck_truncate_53 /* If positive then down==truncate */
279
280LCheck_53_round_up:
281 movl %ebx,%ecx
282 andl $0x000007ff,%ecx
283 orl %edx,%ecx
284 jnz LDo_53_round_up
285 jmp L_Re_normalise
286
287LRound_nearest_53:
288 /* Do rounding of the 53rd bit if needed (nearest or even) */
289 movl %ebx,%ecx
290 andl $0x000007ff,%ecx
291 cmpl $0x00000400,%ecx
292 jc LCheck_truncate_53 /* less than half, no increment needed */
293
294 jnz LGreater_Half_53 /* greater than half, increment needed */
295
296 /* Possibly half, we need to check the ls bits */
297 orl %edx,%edx
298 jnz LGreater_Half_53 /* greater than half, increment needed */
299
300 /* Exactly half, increment only if 53rd bit is 1 (round to even) */
301 testl $0x00000800,%ebx
302 jz LTruncate_53
303
304LGreater_Half_53: /* Rounding: increment at the 53rd bit */
305LDo_53_round_up:
306 movb LOST_UP,FPU_bits_lost
307 andl $0xfffff800,%ebx /* Truncate to 53 bits */
308 addl $0x00000800,%ebx
309 adcl $0,%eax
310 jmp LCheck_Round_Overflow
311
312LCheck_truncate_53:
313 movl %ebx,%ecx
314 andl $0x000007ff,%ecx
315 orl %edx,%ecx
316 jz L_Re_normalise
317
318LTruncate_53:
319 movb LOST_DOWN,FPU_bits_lost
320 andl $0xfffff800,%ebx /* Truncate to 53 bits */
321 jmp L_Re_normalise
322
323
324/* Round etc to 64 bit precision */
325LRound_To_64:
326 movl %esi,%ecx
327 andl CW_RC,%ecx
328 cmpl RC_RND,%ecx
329 je LRound_nearest_64
330
331 cmpl RC_CHOP,%ecx
332 je LCheck_truncate_64
333
334 cmpl RC_UP,%ecx /* Towards +infinity */
335 je LUp_64
336
337 cmpl RC_DOWN,%ecx /* Towards -infinity */
338 je LDown_64
339
340#ifdef PARANOID
341 jmp L_bugged_round64
342#endif /* PARANOID */
343
344LUp_64:
345 cmpb SIGN_POS,PARAM5
346 jne LCheck_truncate_64 /* If negative then up==truncate */
347
348 orl %edx,%edx
349 jnz LDo_64_round_up
350 jmp L_Re_normalise
351
352LDown_64:
353 cmpb SIGN_POS,PARAM5
354 je LCheck_truncate_64 /* If positive then down==truncate */
355
356 orl %edx,%edx
357 jnz LDo_64_round_up
358 jmp L_Re_normalise
359
360LRound_nearest_64:
361 cmpl $0x80000000,%edx
362 jc LCheck_truncate_64
363
364 jne LDo_64_round_up
365
366 /* Now test for round-to-even */
367 testb $1,%bl
368 jz LCheck_truncate_64
369
370LDo_64_round_up:
371 movb LOST_UP,FPU_bits_lost
372 addl $1,%ebx
373 adcl $0,%eax
374
375LCheck_Round_Overflow:
376 jnc L_Re_normalise
377
378 /* Overflow, adjust the result (significand to 1.0) */
379 rcrl $1,%eax
380 rcrl $1,%ebx
381 incw EXP(%edi)
382 jmp L_Re_normalise
383
384LCheck_truncate_64:
385 orl %edx,%edx
386 jz L_Re_normalise
387
388LTruncate_64:
389 movb LOST_DOWN,FPU_bits_lost
390
391L_Re_normalise:
392 testb $0xff,FPU_denormal
393 jnz Normalise_result
394
395L_Normalised:
396 movl TAG_Valid,%edx
397
398L_deNormalised:
399 cmpb LOST_UP,FPU_bits_lost
400 je L_precision_lost_up
401
402 cmpb LOST_DOWN,FPU_bits_lost
403 je L_precision_lost_down
404
405L_no_precision_loss:
406 /* store the result */
407
408L_Store_significand:
409 movl %eax,SIGH(%edi)
410 movl %ebx,SIGL(%edi)
411
412 cmpw EXP_OVER,EXP(%edi)
413 jge L_overflow
414
415 movl %edx,%eax
416
417 /* Convert the exponent to 80x87 form. */
418 addw EXTENDED_Ebias,EXP(%edi)
419 andw $0x7fff,EXP(%edi)
420
421fpu_reg_round_signed_special_exit:
422
423 cmpb SIGN_POS,PARAM5
424 je fpu_reg_round_special_exit
425
426 orw $0x8000,EXP(%edi) /* Negative sign for the result. */
427
428fpu_reg_round_special_exit:
429
430#ifndef NON_REENTRANT_FPU
431 popl %ebx /* adjust the stack pointer */
432#endif /* NON_REENTRANT_FPU */
433
434fpu_Arith_exit:
435 popl %ebx
436 popl %edi
437 popl %esi
438 leave
439 ret
440
441
442/*
443 * Set the FPU status flags to represent precision loss due to
444 * round-up.
445 */
446L_precision_lost_up:
447 push %edx
448 push %eax
449 call set_precision_flag_up
450 popl %eax
451 popl %edx
452 jmp L_no_precision_loss
453
454/*
455 * Set the FPU status flags to represent precision loss due to
456 * truncation.
457 */
458L_precision_lost_down:
459 push %edx
460 push %eax
461 call set_precision_flag_down
462 popl %eax
463 popl %edx
464 jmp L_no_precision_loss
465
466
467/*
468 * The number is a denormal (which might get rounded up to a normal)
469 * Shift the number right the required number of bits, which will
470 * have to be undone later...
471 */
472L_Make_denorm:
473 /* The action to be taken depends upon whether the underflow
474 exception is masked */
475 testb CW_Underflow,%cl /* Underflow mask. */
476 jz Unmasked_underflow /* Do not make a denormal. */
477
478 movb DENORMAL,FPU_denormal
479
480 pushl %ecx /* Save */
481 movw EXP_UNDER+1,%cx
482 subw EXP(%edi),%cx
483
484 cmpw $64,%cx /* shrd only works for 0..31 bits */
485 jnc Denorm_shift_more_than_63
486
487 cmpw $32,%cx /* shrd only works for 0..31 bits */
488 jnc Denorm_shift_more_than_32
489
490/*
491 * We got here without jumps by assuming that the most common requirement
492 * is for a small de-normalising shift.
493 * Shift by [1..31] bits
494 */
495 addw %cx,EXP(%edi)
496 orl %edx,%edx /* extension */
497 setne %ch /* Save whether %edx is non-zero */
498 xorl %edx,%edx
499 shrd %cl,%ebx,%edx
500 shrd %cl,%eax,%ebx
501 shr %cl,%eax
502 orb %ch,%dl
503 popl %ecx
504 jmp Denorm_done
505
506/* Shift by [32..63] bits */
507Denorm_shift_more_than_32:
508 addw %cx,EXP(%edi)
509 subb $32,%cl
510 orl %edx,%edx
511 setne %ch
512 orb %ch,%bl
513 xorl %edx,%edx
514 shrd %cl,%ebx,%edx
515 shrd %cl,%eax,%ebx
516 shr %cl,%eax
517 orl %edx,%edx /* test these 32 bits */
518 setne %cl
519 orb %ch,%bl
520 orb %cl,%bl
521 movl %ebx,%edx
522 movl %eax,%ebx
523 xorl %eax,%eax
524 popl %ecx
525 jmp Denorm_done
526
527/* Shift by [64..) bits */
528Denorm_shift_more_than_63:
529 cmpw $64,%cx
530 jne Denorm_shift_more_than_64
531
532/* Exactly 64 bit shift */
533 addw %cx,EXP(%edi)
534 xorl %ecx,%ecx
535 orl %edx,%edx
536 setne %cl
537 orl %ebx,%ebx
538 setne %ch
539 orb %ch,%cl
540 orb %cl,%al
541 movl %eax,%edx
542 xorl %eax,%eax
543 xorl %ebx,%ebx
544 popl %ecx
545 jmp Denorm_done
546
547Denorm_shift_more_than_64:
548 movw EXP_UNDER+1,EXP(%edi)
549/* This is easy, %eax must be non-zero, so.. */
550 movl $1,%edx
551 xorl %eax,%eax
552 xorl %ebx,%ebx
553 popl %ecx
554 jmp Denorm_done
555
556
557Unmasked_underflow:
558 movb UNMASKED_UNDERFLOW,FPU_denormal
559 jmp Denorm_done
560
561
562/* Undo the de-normalisation. */
563Normalise_result:
564 cmpb UNMASKED_UNDERFLOW,FPU_denormal
565 je Signal_underflow
566
567/* The number must be a denormal if we got here. */
568#ifdef PARANOID
569 /* But check it... just in case. */
570 cmpw EXP_UNDER+1,EXP(%edi)
571 jne L_norm_bugged
572#endif /* PARANOID */
573
574#ifdef PECULIAR_486
575 /*
576 * This implements a special feature of 80486 behaviour.
577 * Underflow will be signalled even if the number is
578 * not a denormal after rounding.
579 * This difference occurs only for masked underflow, and not
580 * in the unmasked case.
581 * Actual 80486 behaviour differs from this in some circumstances.
582 */
583 orl %eax,%eax /* ms bits */
584 js LPseudoDenormal /* Will be masked underflow */
585#else
586 orl %eax,%eax /* ms bits */
587 js L_Normalised /* No longer a denormal */
588#endif /* PECULIAR_486 */
589
590 jnz LDenormal_adj_exponent
591
592 orl %ebx,%ebx
593 jz L_underflow_to_zero /* The contents are zero */
594
595LDenormal_adj_exponent:
596 decw EXP(%edi)
597
598LPseudoDenormal:
599 testb $0xff,FPU_bits_lost /* bits lost == underflow */
600 movl TAG_Special,%edx
601 jz L_deNormalised
602
603 /* There must be a masked underflow */
604 push %eax
605 pushl EX_Underflow
606 call EXCEPTION
607 popl %eax
608 popl %eax
609 movl TAG_Special,%edx
610 jmp L_deNormalised
611
612
613/*
614 * The operations resulted in a number too small to represent.
615 * Masked response.
616 */
617L_underflow_to_zero:
618 push %eax
619 call set_precision_flag_down
620 popl %eax
621
622 push %eax
623 pushl EX_Underflow
624 call EXCEPTION
625 popl %eax
626 popl %eax
627
628/* Reduce the exponent to EXP_UNDER */
629 movw EXP_UNDER,EXP(%edi)
630 movl TAG_Zero,%edx
631 jmp L_Store_significand
632
633
634/* The operations resulted in a number too large to represent. */
635L_overflow:
636 addw EXTENDED_Ebias,EXP(%edi) /* Set for unmasked response. */
637 push %edi
638 call arith_overflow
639 pop %edi
640 jmp fpu_reg_round_signed_special_exit
641
642
643Signal_underflow:
644 /* The number may have been changed to a non-denormal */
645 /* by the rounding operations. */
646 cmpw EXP_UNDER,EXP(%edi)
647 jle Do_unmasked_underflow
648
649 jmp L_Normalised
650
651Do_unmasked_underflow:
652 /* Increase the exponent by the magic number */
653 addw $(3*(1<<13)),EXP(%edi)
654 push %eax
655 pushl EX_Underflow
656 call EXCEPTION
657 popl %eax
658 popl %eax
659 jmp L_Normalised
660
661
662#ifdef PARANOID
663#ifdef PECULIAR_486
664L_bugged_denorm_486:
665 pushl EX_INTERNAL|0x236
666 call EXCEPTION
667 popl %ebx
668 jmp L_exception_exit
669#else
670L_bugged_denorm:
671 pushl EX_INTERNAL|0x230
672 call EXCEPTION
673 popl %ebx
674 jmp L_exception_exit
675#endif /* PECULIAR_486 */
676
677L_bugged_round24:
678 pushl EX_INTERNAL|0x231
679 call EXCEPTION
680 popl %ebx
681 jmp L_exception_exit
682
683L_bugged_round53:
684 pushl EX_INTERNAL|0x232
685 call EXCEPTION
686 popl %ebx
687 jmp L_exception_exit
688
689L_bugged_round64:
690 pushl EX_INTERNAL|0x233
691 call EXCEPTION
692 popl %ebx
693 jmp L_exception_exit
694
695L_norm_bugged:
696 pushl EX_INTERNAL|0x234
697 call EXCEPTION
698 popl %ebx
699 jmp L_exception_exit
700
701L_entry_bugged:
702 pushl EX_INTERNAL|0x235
703 call EXCEPTION
704 popl %ebx
705L_exception_exit:
706 mov $-1,%eax
707 jmp fpu_reg_round_special_exit
708#endif /* PARANOID */
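
For readers following the 53-bit path above (LRound_nearest_53 and friends), here is a stand-alone C sketch (not part of the patch) of the same round-to-nearest-even decision on a 64-bit significand with a 32-bit sticky extension; the names are illustrative and the bits-lost bookkeeping is omitted.

#include <stdint.h>
#include <stdio.h>

static uint64_t round53(uint64_t sig, uint32_t ext, int *exp)
{
    uint64_t low = sig & 0x7ff;          /* the 11 bits to be discarded */
    int up = 0;

    if (low > 0x400)
        up = 1;                          /* clearly more than half */
    else if (low == 0x400 && ext)
        up = 1;                          /* half plus sticky bits: more than half */
    else if (low == 0x400 && !ext)
        up = (sig >> 11) & 1;            /* exact tie: round to even */

    sig &= ~(uint64_t)0x7ff;             /* truncate to 53 bits */
    if (up) {
        uint64_t before = sig;

        sig += 0x800;
        if (sig < before) {              /* carry out of bit 63 */
            sig = 0x8000000000000000ULL;
            (*exp)++;
        }
    }
    return sig;
}

int main(void)
{
    int exp = 0;

    /* all ones rounds up and overflows the significand: the exponent goes up by one */
    printf("%016llx exp=%d\n",
           (unsigned long long)round53(0xffffffffffffffffULL, 0, &exp), exp);
    return 0;
}
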
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
new file mode 100644
index 000000000000..47c4c2434d85
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -0,0 +1,167 @@
1 .file "reg_u_add.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_add.S |
4 | |
5 | Add two valid (TAG_Valid) FPU_REG numbers, of the same sign, and put the |
6 | result in a destination FPU_REG. |
7 | |
8 | Copyright (C) 1992,1993,1995,1997 |
9 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
10 | E-mail billm@suburbia.net |
11 | |
12 | Call from C as: |
13 | int FPU_u_add(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
14 | int control_w) |
15 | Return value is the tag of the answer, or-ed with FPU_Exception if |
16 | one was raised, or -1 on internal error. |
17 | |
18 +---------------------------------------------------------------------------*/
19
20/*
21 | Kernel addition routine FPU_u_add(reg *arg1, reg *arg2, reg *answ).
22 | Takes two valid reg f.p. numbers (TAG_Valid), which are
23 | treated as unsigned numbers,
24 | and returns their sum as a TAG_Valid or TAG_Special f.p. number.
25 | The returned number is normalized.
26 | Basic checks are performed if PARANOID is defined.
27 */
28
29#include "exception.h"
30#include "fpu_emu.h"
31#include "control_w.h"
32
33.text
34ENTRY(FPU_u_add)
35 pushl %ebp
36 movl %esp,%ebp
37 pushl %esi
38 pushl %edi
39 pushl %ebx
40
41 movl PARAM1,%esi /* source 1 */
42 movl PARAM2,%edi /* source 2 */
43
44 movl PARAM6,%ecx
45 movl %ecx,%edx
46 subl PARAM7,%ecx /* exp1 - exp2 */
47 jge L_arg1_larger
48
49 /* num1 is smaller */
50 movl SIGL(%esi),%ebx
51 movl SIGH(%esi),%eax
52
53 movl %edi,%esi
54 movl PARAM7,%edx
55 negw %cx
56 jmp L_accum_loaded
57
58L_arg1_larger:
59 /* num1 has larger or equal exponent */
60 movl SIGL(%edi),%ebx
61 movl SIGH(%edi),%eax
62
63L_accum_loaded:
64 movl PARAM3,%edi /* destination */
65 movw %dx,EXP(%edi) /* Copy exponent to destination */
66
67 xorl %edx,%edx /* clear the extension */
68
69#ifdef PARANOID
70 testl $0x80000000,%eax
71 je L_bugged
72
73 testl $0x80000000,SIGH(%esi)
74 je L_bugged
75#endif /* PARANOID */
76
77/* The number to be shifted is in %eax:%ebx:%edx */
78 cmpw $32,%cx /* shrd only works for 0..31 bits */
79 jnc L_more_than_31
80
81/* less than 32 bits */
82 shrd %cl,%ebx,%edx
83 shrd %cl,%eax,%ebx
84 shr %cl,%eax
85 jmp L_shift_done
86
87L_more_than_31:
88 cmpw $64,%cx
89 jnc L_more_than_63
90
91 subb $32,%cl
92 jz L_exactly_32
93
94 shrd %cl,%eax,%edx
95 shr %cl,%eax
96 orl %ebx,%ebx
97 jz L_more_31_no_low /* none of the lowest bits is set */
98
99 orl $1,%edx /* record the fact in the extension */
100
101L_more_31_no_low:
102 movl %eax,%ebx
103 xorl %eax,%eax
104 jmp L_shift_done
105
106L_exactly_32:
107 movl %ebx,%edx
108 movl %eax,%ebx
109 xorl %eax,%eax
110 jmp L_shift_done
111
112L_more_than_63:
113 cmpw $65,%cx
114 jnc L_more_than_64
115
116 movl %eax,%edx
117 orl %ebx,%ebx
118 jz L_more_63_no_low
119
120 orl $1,%edx
121 jmp L_more_63_no_low
122
123L_more_than_64:
124	movl	$1,%edx	/* The shifted nr always has at least one '1' */
125
126L_more_63_no_low:
127 xorl %ebx,%ebx
128 xorl %eax,%eax
129
130L_shift_done:
131 /* Now do the addition */
132 addl SIGL(%esi),%ebx
133 adcl SIGH(%esi),%eax
134 jnc L_round_the_result
135
136 /* Overflow, adjust the result */
137 rcrl $1,%eax
138 rcrl $1,%ebx
139 rcrl $1,%edx
140 jnc L_no_bit_lost
141
142 orl $1,%edx
143
144L_no_bit_lost:
145 incw EXP(%edi)
146
147L_round_the_result:
148 jmp fpu_reg_round /* Round the result */
149
150
151
152#ifdef PARANOID
153/* If we ever get here then we have problems! */
154L_bugged:
155 pushl EX_INTERNAL|0x201
156 call EXCEPTION
157 pop %ebx
158 movl $-1,%eax
159 jmp L_exit
160
161L_exit:
162 popl %ebx
163 popl %edi
164 popl %esi
165 leave
166 ret
167#endif /* PARANOID */
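
The reg_u_add.S routine above aligns the smaller-exponent significand by shifting it right into a 32-bit "extension" word with a sticky low bit, adds, and absorbs any carry with a one-bit right rotate plus an exponent increment. The following is a minimal userspace C sketch of that sequence (an illustration, not the emulator code; names such as u_add_sketch are invented here, and the 64/65-bit shift cases are collapsed into the sticky bit):

#include <stdint.h>
#include <stdio.h>

struct usum { uint64_t sig; uint32_t ext; int exp; };

/* 'big' is assumed to carry the larger (or equal) exponent. */
static struct usum u_add_sketch(uint64_t big, int exp_big,
                                uint64_t small, int exp_small)
{
    struct usum r = { big, 0, exp_big };
    unsigned shift = (unsigned)(exp_big - exp_small);

    if (shift == 0) {
        r.ext = 0;
    } else if (shift < 64) {
        uint64_t lost = small << (64 - shift);   /* shifted-out bits, msb-aligned */
        r.ext = (uint32_t)(lost >> 32);
        if ((uint32_t)lost)
            r.ext |= 1;                          /* sticky bit */
        small >>= shift;
    } else {
        /* The assembly keeps more detail for shifts of 64 and 65 bits;
         * the sketch collapses everything into the sticky bit. */
        r.ext = small ? 1 : 0;
        small = 0;
    }

    r.sig = big + small;
    if (r.sig < big) {                           /* carry out of bit 63 */
        uint32_t out = r.ext & 1;                /* bit rotated out of the extension */
        r.ext = (r.ext >> 1) | ((uint32_t)(r.sig & 1) << 31) | out;
        r.sig = (r.sig >> 1) | (1ULL << 63);
        r.exp++;                                 /* incw EXP(%edi) in the assembly */
    }
    return r;
}

int main(void)
{
    struct usum s = u_add_sketch(0xc000000000000000ULL, 3,
                                 0x8000000000000001ULL, 1);
    printf("sig=%016llx ext=%08x exp=%d\n",
           (unsigned long long)s.sig, s.ext, s.exp);
    return 0;
}

The real routine then jumps to fpu_reg_round, which consumes the extension word to produce the final rounded 53- or 64-bit result.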
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
new file mode 100644
index 000000000000..cc00654b6f9a
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -0,0 +1,471 @@
1 .file "reg_u_div.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_div.S |
4 | |
5 | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Call from C as: |
16 | int FPU_u_div(FPU_REG *a, FPU_REG *b, FPU_REG *dest, |
17 | unsigned int control_word, char *sign) |
18 | |
19 | Does not compute the destination exponent, but does adjust it. |
20 | |
21 | Return value is the tag of the answer, or-ed with FPU_Exception if |
22 | one was raised, or -1 on internal error. |
23 +---------------------------------------------------------------------------*/
24
25#include "exception.h"
26#include "fpu_emu.h"
27#include "control_w.h"
28
29
30/* #define dSIGL(x) (x) */
31/* #define dSIGH(x) 4(x) */
32
33
34#ifndef NON_REENTRANT_FPU
35/*
36 Local storage on the stack:
37 Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
38 Overflow flag: ovfl_flag
39 */
40#define FPU_accum_3 -4(%ebp)
41#define FPU_accum_2 -8(%ebp)
42#define FPU_accum_1 -12(%ebp)
43#define FPU_accum_0 -16(%ebp)
44#define FPU_result_1 -20(%ebp)
45#define FPU_result_2 -24(%ebp)
46#define FPU_ovfl_flag -28(%ebp)
47
48#else
49.data
50/*
51 Local storage in a static area:
52 Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
53 Overflow flag: ovfl_flag
54 */
55 .align 4,0
56FPU_accum_3:
57 .long 0
58FPU_accum_2:
59 .long 0
60FPU_accum_1:
61 .long 0
62FPU_accum_0:
63 .long 0
64FPU_result_1:
65 .long 0
66FPU_result_2:
67 .long 0
68FPU_ovfl_flag:
69 .byte 0
70#endif /* NON_REENTRANT_FPU */
71
72#define REGA PARAM1
73#define REGB PARAM2
74#define DEST PARAM3
75
76.text
77ENTRY(FPU_u_div)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83
84 pushl %esi
85 pushl %edi
86 pushl %ebx
87
88 movl REGA,%esi
89 movl REGB,%ebx
90 movl DEST,%edi
91
92 movswl EXP(%esi),%edx
93 movswl EXP(%ebx),%eax
94 subl %eax,%edx
95 addl EXP_BIAS,%edx
96
97 /* A denormal and a large number can cause an exponent underflow */
98 cmpl EXP_WAY_UNDER,%edx
99 jg xExp_not_underflow
100
101	 /* Set to a really low value to allow correct handling */
102 movl EXP_WAY_UNDER,%edx
103
104xExp_not_underflow:
105
106 movw %dx,EXP(%edi)
107
108#ifdef PARANOID
109/* testl $0x80000000, SIGH(%esi) // Dividend */
110/* je L_bugged */
111 testl $0x80000000, SIGH(%ebx) /* Divisor */
112 je L_bugged
113#endif /* PARANOID */
114
115/* Check if the divisor can be treated as having just 32 bits */
116 cmpl $0,SIGL(%ebx)
117 jnz L_Full_Division /* Can't do a quick divide */
118
119/* We should be able to zip through the division here */
120 movl SIGH(%ebx),%ecx /* The divisor */
121 movl SIGH(%esi),%edx /* Dividend */
122 movl SIGL(%esi),%eax /* Dividend */
123
124 cmpl %ecx,%edx
125 setaeb FPU_ovfl_flag /* Keep a record */
126 jb L_no_adjust
127
128 subl %ecx,%edx /* Prevent the overflow */
129
130L_no_adjust:
131 /* Divide the 64 bit number by the 32 bit denominator */
132 divl %ecx
133 movl %eax,FPU_result_2
134
135 /* Work on the remainder of the first division */
136 xorl %eax,%eax
137 divl %ecx
138 movl %eax,FPU_result_1
139
140 /* Work on the remainder of the 64 bit division */
141 xorl %eax,%eax
142 divl %ecx
143
144 testb $255,FPU_ovfl_flag /* was the num > denom ? */
145 je L_no_overflow
146
147 /* Do the shifting here */
148 /* increase the exponent */
149 incw EXP(%edi)
150
151 /* shift the mantissa right one bit */
152 stc /* To set the ms bit */
153 rcrl FPU_result_2
154 rcrl FPU_result_1
155 rcrl %eax
156
157L_no_overflow:
158 jmp LRound_precision /* Do the rounding as required */
159
160
161/*---------------------------------------------------------------------------+
162 | Divide: Return arg1/arg2 to arg3. |
163 | |
164 | This routine does not use the exponents of arg1 and arg2, but does |
165 | adjust the exponent of arg3. |
166 | |
167 | The maximum returned value is (ignoring exponents) |
168 | .ffffffff ffffffff |
169 | ------------------ = 1.ffffffff fffffffe |
170 | .80000000 00000000 |
171 | and the minimum is |
172 | .80000000 00000000 |
173 | ------------------ = .80000000 00000001 (rounded) |
174 | .ffffffff ffffffff |
175 | |
176 +---------------------------------------------------------------------------*/
177
178
179L_Full_Division:
180 /* Save extended dividend in local register */
181 movl SIGL(%esi),%eax
182 movl %eax,FPU_accum_2
183 movl SIGH(%esi),%eax
184 movl %eax,FPU_accum_3
185 xorl %eax,%eax
186 movl %eax,FPU_accum_1 /* zero the extension */
187 movl %eax,FPU_accum_0 /* zero the extension */
188
189 movl SIGL(%esi),%eax /* Get the current num */
190 movl SIGH(%esi),%edx
191
192/*----------------------------------------------------------------------*/
193/* Initialization done.
194 Do the first 32 bits. */
195
196 movb $0,FPU_ovfl_flag
197 cmpl SIGH(%ebx),%edx /* Test for imminent overflow */
198 jb LLess_than_1
199 ja LGreater_than_1
200
201 cmpl SIGL(%ebx),%eax
202 jb LLess_than_1
203
204LGreater_than_1:
205/* The dividend is greater or equal, would cause overflow */
206 setaeb FPU_ovfl_flag /* Keep a record */
207
208 subl SIGL(%ebx),%eax
209 sbbl SIGH(%ebx),%edx /* Prevent the overflow */
210 movl %eax,FPU_accum_2
211 movl %edx,FPU_accum_3
212
213LLess_than_1:
214/* At this point, we have a dividend < divisor, with a record of
215 adjustment in FPU_ovfl_flag */
216
217 /* We will divide by a number which is too large */
218 movl SIGH(%ebx),%ecx
219 addl $1,%ecx
220 jnc LFirst_div_not_1
221
222 /* here we need to divide by 100000000h,
223 i.e., no division at all.. */
224 mov %edx,%eax
225 jmp LFirst_div_done
226
227LFirst_div_not_1:
228 divl %ecx /* Divide the numerator by the augmented
229 denom ms dw */
230
231LFirst_div_done:
232 movl %eax,FPU_result_2 /* Put the result in the answer */
233
234 mull SIGH(%ebx) /* mul by the ms dw of the denom */
235
236 subl %eax,FPU_accum_2 /* Subtract from the num local reg */
237 sbbl %edx,FPU_accum_3
238
239 movl FPU_result_2,%eax /* Get the result back */
240 mull SIGL(%ebx) /* now mul the ls dw of the denom */
241
242 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
243 sbbl %edx,FPU_accum_2
244 sbbl $0,FPU_accum_3
245 je LDo_2nd_32_bits /* Must check for non-zero result here */
246
247#ifdef PARANOID
248 jb L_bugged_1
249#endif /* PARANOID */
250
251	 /* need to subtract the denom once more */
252 incl FPU_result_2 /* Correct the answer */
253
254 movl SIGL(%ebx),%eax
255 movl SIGH(%ebx),%edx
256 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
257 sbbl %edx,FPU_accum_2
258
259#ifdef PARANOID
260 sbbl $0,FPU_accum_3
261 jne L_bugged_1 /* Must check for non-zero result here */
262#endif /* PARANOID */
263
264/*----------------------------------------------------------------------*/
265/* Half of the main problem is done, there is just a reduced numerator
266 to handle now.
267 Work with the second 32 bits, FPU_accum_0 not used from now on */
268LDo_2nd_32_bits:
269 movl FPU_accum_2,%edx /* get the reduced num */
270 movl FPU_accum_1,%eax
271
272 /* need to check for possible subsequent overflow */
273 cmpl SIGH(%ebx),%edx
274 jb LDo_2nd_div
275 ja LPrevent_2nd_overflow
276
277 cmpl SIGL(%ebx),%eax
278 jb LDo_2nd_div
279
280LPrevent_2nd_overflow:
281/* The numerator is greater or equal, would cause overflow */
282 /* prevent overflow */
283 subl SIGL(%ebx),%eax
284 sbbl SIGH(%ebx),%edx
285 movl %edx,FPU_accum_2
286 movl %eax,FPU_accum_1
287
288 incl FPU_result_2 /* Reflect the subtraction in the answer */
289
290#ifdef PARANOID
291 je L_bugged_2 /* Can't bump the result to 1.0 */
292#endif /* PARANOID */
293
294LDo_2nd_div:
295 cmpl $0,%ecx /* augmented denom msw */
296 jnz LSecond_div_not_1
297
298 /* %ecx == 0, we are dividing by 1.0 */
299 mov %edx,%eax
300 jmp LSecond_div_done
301
302LSecond_div_not_1:
303 divl %ecx /* Divide the numerator by the denom ms dw */
304
305LSecond_div_done:
306 movl %eax,FPU_result_1 /* Put the result in the answer */
307
308 mull SIGH(%ebx) /* mul by the ms dw of the denom */
309
310 subl %eax,FPU_accum_1 /* Subtract from the num local reg */
311 sbbl %edx,FPU_accum_2
312
313#ifdef PARANOID
314 jc L_bugged_2
315#endif /* PARANOID */
316
317 movl FPU_result_1,%eax /* Get the result back */
318 mull SIGL(%ebx) /* now mul the ls dw of the denom */
319
320 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
321 sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
322 sbbl $0,FPU_accum_2
323
324#ifdef PARANOID
325 jc L_bugged_2
326#endif /* PARANOID */
327
328 jz LDo_3rd_32_bits
329
330#ifdef PARANOID
331 cmpl $1,FPU_accum_2
332 jne L_bugged_2
333#endif /* PARANOID */
334
335	 /* need to subtract the denom once more */
336 movl SIGL(%ebx),%eax
337 movl SIGH(%ebx),%edx
338 subl %eax,FPU_accum_0 /* Subtract from the num local reg */
339 sbbl %edx,FPU_accum_1
340 sbbl $0,FPU_accum_2
341
342#ifdef PARANOID
343 jc L_bugged_2
344 jne L_bugged_2
345#endif /* PARANOID */
346
347 addl $1,FPU_result_1 /* Correct the answer */
348 adcl $0,FPU_result_2
349
350#ifdef PARANOID
351 jc L_bugged_2 /* Must check for non-zero result here */
352#endif /* PARANOID */
353
354/*----------------------------------------------------------------------*/
355/* The division is essentially finished here, we just need to perform
356 tidying operations.
357 Deal with the 3rd 32 bits */
358LDo_3rd_32_bits:
359 movl FPU_accum_1,%edx /* get the reduced num */
360 movl FPU_accum_0,%eax
361
362 /* need to check for possible subsequent overflow */
363 cmpl SIGH(%ebx),%edx /* denom */
364 jb LRound_prep
365 ja LPrevent_3rd_overflow
366
367 cmpl SIGL(%ebx),%eax /* denom */
368 jb LRound_prep
369
370LPrevent_3rd_overflow:
371 /* prevent overflow */
372 subl SIGL(%ebx),%eax
373 sbbl SIGH(%ebx),%edx
374 movl %edx,FPU_accum_1
375 movl %eax,FPU_accum_0
376
377 addl $1,FPU_result_1 /* Reflect the subtraction in the answer */
378 adcl $0,FPU_result_2
379 jne LRound_prep
380 jnc LRound_prep
381
382 /* This is a tricky spot, there is an overflow of the answer */
383 movb $255,FPU_ovfl_flag /* Overflow -> 1.000 */
384
385LRound_prep:
386/*
387 * Prepare for rounding.
388 * To test for rounding, we just need to compare 2*accum with the
389 * denom.
390 */
391 movl FPU_accum_0,%ecx
392 movl FPU_accum_1,%edx
393 movl %ecx,%eax
394 orl %edx,%eax
395 jz LRound_ovfl /* The accumulator contains zero. */
396
397 /* Multiply by 2 */
398 clc
399 rcll $1,%ecx
400 rcll $1,%edx
401 jc LRound_large /* No need to compare, denom smaller */
402
403 subl SIGL(%ebx),%ecx
404 sbbl SIGH(%ebx),%edx
405 jnc LRound_not_small
406
407 movl $0x70000000,%eax /* Denom was larger */
408 jmp LRound_ovfl
409
410LRound_not_small:
411 jnz LRound_large
412
413 movl $0x80000000,%eax /* Remainder was exactly 1/2 denom */
414 jmp LRound_ovfl
415
416LRound_large:
417 movl $0xff000000,%eax /* Denom was smaller */
418
419LRound_ovfl:
420/* We are now ready to deal with rounding, but first we must get
421 the bits properly aligned */
422 testb $255,FPU_ovfl_flag /* was the num > denom ? */
423 je LRound_precision
424
425 incw EXP(%edi)
426
427 /* shift the mantissa right one bit */
428 stc /* Will set the ms bit */
429 rcrl FPU_result_2
430 rcrl FPU_result_1
431 rcrl %eax
432
433/* Round the result as required */
434LRound_precision:
435 decw EXP(%edi) /* binary point between 1st & 2nd bits */
436
437 movl %eax,%edx
438 movl FPU_result_1,%ebx
439 movl FPU_result_2,%eax
440 jmp fpu_reg_round
441
442
443#ifdef PARANOID
444/* The logic is wrong if we got here */
445L_bugged:
446 pushl EX_INTERNAL|0x202
447 call EXCEPTION
448 pop %ebx
449 jmp L_exit
450
451L_bugged_1:
452 pushl EX_INTERNAL|0x203
453 call EXCEPTION
454 pop %ebx
455 jmp L_exit
456
457L_bugged_2:
458 pushl EX_INTERNAL|0x204
459 call EXCEPTION
460 pop %ebx
461 jmp L_exit
462
463L_exit:
464 movl $-1,%eax
465 popl %ebx
466 popl %edi
467 popl %esi
468
469 leave
470 ret
471#endif /* PARANOID */
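
The quick path of FPU_u_div above (taken when the divisor's low 32 bits are zero) chains three 64-by-32 divisions to get a 64-bit quotient plus a 32-bit rounding word, pre-subtracting the divisor once when the dividend is not smaller so that divl cannot overflow. A small illustrative C sketch of that path follows (names like quick_div_sketch are invented for the example; the full 64-bit-divisor path in the assembly is more involved):

#include <stdint.h>
#include <stdio.h>

struct quot { uint64_t q; uint32_t round; int exp_adjust; };

static struct quot quick_div_sketch(uint64_t num, uint32_t den_hi)
{
    struct quot r = { 0, 0, 0 };
    uint32_t n_hi = (uint32_t)(num >> 32);
    uint32_t n_lo = (uint32_t)num;
    uint64_t rem;

    /* If num >= den, pre-subtract so the first divl cannot overflow and
     * remember to shift the result right by one bit at the end. */
    int adjust = n_hi >= den_hi;
    if (adjust)
        n_hi -= den_hi;

    uint64_t d1 = ((uint64_t)n_hi << 32) | n_lo;     /* first divl */
    uint32_t q_hi = (uint32_t)(d1 / den_hi);
    rem = d1 % den_hi;

    uint64_t d2 = rem << 32;                          /* second divl */
    uint32_t q_lo = (uint32_t)(d2 / den_hi);
    rem = d2 % den_hi;

    uint64_t d3 = rem << 32;                          /* third divl: rounding word */
    r.round = (uint32_t)(d3 / den_hi);

    r.q = ((uint64_t)q_hi << 32) | q_lo;

    if (adjust) {
        /* num was >= den: shift right one bit, setting the new msb,
         * and bump the exponent (stc/rcr + incw EXP in the assembly). */
        r.round = (r.round >> 1) | ((uint32_t)r.q << 31);
        r.q = (r.q >> 1) | (1ULL << 63);
        r.exp_adjust = 1;
    }
    return r;
}

int main(void)
{
    /* 0.75 / 0.5: quotient significand 0xC000... with the exponent bumped */
    struct quot r = quick_div_sketch(0xc000000000000000ULL, 0x80000000u);
    printf("q=%016llx round=%08x exp+=%d\n",
           (unsigned long long)r.q, r.round, r.exp_adjust);
    return 0;
}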
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
new file mode 100644
index 000000000000..973f12af97df
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -0,0 +1,148 @@
1 .file "reg_u_mul.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_mul.S |
4 | |
5 | Core multiplication routine |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | |
12 +---------------------------------------------------------------------------*/
13
14/*---------------------------------------------------------------------------+
15 | Basic multiplication routine. |
16 | Does not check the resulting exponent for overflow/underflow |
17 | |
18 | FPU_u_mul(FPU_REG *a, FPU_REG *b, FPU_REG *c, unsigned int cw); |
19 | |
20 | Internal working is at approx 128 bits. |
21 | Result is rounded to nearest 53 or 64 bits, using "nearest or even". |
22 +---------------------------------------------------------------------------*/
23
24#include "exception.h"
25#include "fpu_emu.h"
26#include "control_w.h"
27
28
29
30#ifndef NON_REENTRANT_FPU
31/* Local storage on the stack: */
32#define FPU_accum_0 -4(%ebp) /* ms word */
33#define FPU_accum_1 -8(%ebp)
34
35#else
36/* Local storage in a static area: */
37.data
38 .align 4,0
39FPU_accum_0:
40 .long 0
41FPU_accum_1:
42 .long 0
43#endif /* NON_REENTRANT_FPU */
44
45
46.text
47ENTRY(FPU_u_mul)
48 pushl %ebp
49 movl %esp,%ebp
50#ifndef NON_REENTRANT_FPU
51 subl $8,%esp
52#endif /* NON_REENTRANT_FPU */
53
54 pushl %esi
55 pushl %edi
56 pushl %ebx
57
58 movl PARAM1,%esi
59 movl PARAM2,%edi
60
61#ifdef PARANOID
62 testl $0x80000000,SIGH(%esi)
63 jz L_bugged
64 testl $0x80000000,SIGH(%edi)
65 jz L_bugged
66#endif /* PARANOID */
67
68 xorl %ecx,%ecx
69 xorl %ebx,%ebx
70
71 movl SIGL(%esi),%eax
72 mull SIGL(%edi)
73 movl %eax,FPU_accum_0
74 movl %edx,FPU_accum_1
75
76 movl SIGL(%esi),%eax
77 mull SIGH(%edi)
78 addl %eax,FPU_accum_1
79 adcl %edx,%ebx
80/* adcl $0,%ecx // overflow here is not possible */
81
82 movl SIGH(%esi),%eax
83 mull SIGL(%edi)
84 addl %eax,FPU_accum_1
85 adcl %edx,%ebx
86 adcl $0,%ecx
87
88 movl SIGH(%esi),%eax
89 mull SIGH(%edi)
90 addl %eax,%ebx
91 adcl %edx,%ecx
92
93 /* Get the sum of the exponents. */
94 movl PARAM6,%eax
95 subl EXP_BIAS-1,%eax
96
97 /* Two denormals can cause an exponent underflow */
98 cmpl EXP_WAY_UNDER,%eax
99 jg Exp_not_underflow
100
101	 /* Set to a really low value to allow correct handling */
102 movl EXP_WAY_UNDER,%eax
103
104Exp_not_underflow:
105
106/* Have now finished with the sources */
107 movl PARAM3,%edi /* Point to the destination */
108 movw %ax,EXP(%edi)
109
110/* Now make sure that the result is normalized */
111 testl $0x80000000,%ecx
112 jnz LResult_Normalised
113
114 /* Normalize by shifting left one bit */
115 shll $1,FPU_accum_0
116 rcll $1,FPU_accum_1
117 rcll $1,%ebx
118 rcll $1,%ecx
119 decw EXP(%edi)
120
121LResult_Normalised:
122 movl FPU_accum_0,%eax
123 movl FPU_accum_1,%edx
124 orl %eax,%eax
125 jz L_extent_zero
126
127 orl $1,%edx
128
129L_extent_zero:
130 movl %ecx,%eax
131 jmp fpu_reg_round
132
133
134#ifdef PARANOID
135L_bugged:
136 pushl EX_INTERNAL|0x205
137 call EXCEPTION
138 pop %ebx
139 jmp L_exit
140
141L_exit:
142 popl %ebx
143 popl %edi
144 popl %esi
145 leave
146 ret
147#endif /* PARANOID */
148
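The four mull instructions in FPU_u_mul implement the textbook 64x64 -> 128 bit product from four 32x32 -> 64 partial products; the lowest 32 bits of the product are then folded into a sticky bit before rounding. A self-contained C sketch of the multiply itself (illustrative only; the helper name is invented here):

#include <stdint.h>
#include <stdio.h>

static void mul64x64_sketch(uint64_t a, uint64_t b,
                            uint64_t *hi, uint64_t *lo)
{
    uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
    uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);

    uint64_t p0 = (uint64_t)a_lo * b_lo;   /* bits   0..63  */
    uint64_t p1 = (uint64_t)a_lo * b_hi;   /* bits  32..95  */
    uint64_t p2 = (uint64_t)a_hi * b_lo;   /* bits  32..95  */
    uint64_t p3 = (uint64_t)a_hi * b_hi;   /* bits  64..127 */

    uint64_t mid = (p0 >> 32) + (uint32_t)p1 + (uint32_t)p2;

    *lo = (uint32_t)p0 | (mid << 32);
    *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}

int main(void)
{
    uint64_t hi, lo;

    /* 2^63 * 2^63 == 2^126:  hi = 0x4000..., lo = 0 */
    mul64x64_sketch(0x8000000000000000ULL, 0x8000000000000000ULL, &hi, &lo);
    printf("hi=%016llx lo=%016llx\n",
           (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}

Because both inputs are normalized (msb set), the 128-bit product has its top bit in position 126 or 127; the single conditional left shift after LResult_Normalised is all the renormalization that is ever needed.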
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
new file mode 100644
index 000000000000..1b6c24801d22
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -0,0 +1,272 @@
1 .file "reg_u_sub.S"
2/*---------------------------------------------------------------------------+
3 | reg_u_sub.S |
4 | |
5 | Core floating point subtraction routine. |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
9 | E-mail billm@suburbia.net |
10 | |
11 | Call from C as: |
12 | int FPU_u_sub(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
13 | int control_w) |
14 | Return value is the tag of the answer, or-ed with FPU_Exception if |
15 | one was raised, or -1 on internal error. |
16 | |
17 +---------------------------------------------------------------------------*/
18
19/*
20 | Kernel subtraction routine FPU_u_sub(reg *arg1, reg *arg2, reg *answ).
21 | Takes two valid reg f.p. numbers (TAG_Valid), which are
22 | treated as unsigned numbers,
23 | and returns their difference as a TAG_Valid or TAG_Zero f.p.
24 | number.
25 | The first number (arg1) must be the larger.
26 | The returned number is normalized.
27 | Basic checks are performed if PARANOID is defined.
28 */
29
30#include "exception.h"
31#include "fpu_emu.h"
32#include "control_w.h"
33
34.text
35ENTRY(FPU_u_sub)
36 pushl %ebp
37 movl %esp,%ebp
38 pushl %esi
39 pushl %edi
40 pushl %ebx
41
42 movl PARAM1,%esi /* source 1 */
43 movl PARAM2,%edi /* source 2 */
44
45 movl PARAM6,%ecx
46 subl PARAM7,%ecx /* exp1 - exp2 */
47
48#ifdef PARANOID
49 /* source 2 is always smaller than source 1 */
50 js L_bugged_1
51
52	testl	$0x80000000,SIGH(%edi)	/* The args are assumed to be normalized */
53 je L_bugged_2
54
55 testl $0x80000000,SIGH(%esi)
56 je L_bugged_2
57#endif /* PARANOID */
58
59/*--------------------------------------+
60 | Form a register holding the |
61 | smaller number |
62 +--------------------------------------*/
63 movl SIGH(%edi),%eax /* register ms word */
64 movl SIGL(%edi),%ebx /* register ls word */
65
66 movl PARAM3,%edi /* destination */
67 movl PARAM6,%edx
68 movw %dx,EXP(%edi) /* Copy exponent to destination */
69
70 xorl %edx,%edx /* register extension */
71
72/*--------------------------------------+
73 | Shift the temporary register |
74 | right the required number of |
75 | places. |
76 +--------------------------------------*/
77
78 cmpw $32,%cx /* shrd only works for 0..31 bits */
79 jnc L_more_than_31
80
81/* less than 32 bits */
82 shrd %cl,%ebx,%edx
83 shrd %cl,%eax,%ebx
84 shr %cl,%eax
85 jmp L_shift_done
86
87L_more_than_31:
88 cmpw $64,%cx
89 jnc L_more_than_63
90
91 subb $32,%cl
92 jz L_exactly_32
93
94 shrd %cl,%eax,%edx
95 shr %cl,%eax
96 orl %ebx,%ebx
97 jz L_more_31_no_low /* none of the lowest bits is set */
98
99 orl $1,%edx /* record the fact in the extension */
100
101L_more_31_no_low:
102 movl %eax,%ebx
103 xorl %eax,%eax
104 jmp L_shift_done
105
106L_exactly_32:
107 movl %ebx,%edx
108 movl %eax,%ebx
109 xorl %eax,%eax
110 jmp L_shift_done
111
112L_more_than_63:
113 cmpw $65,%cx
114 jnc L_more_than_64
115
116 /* Shift right by 64 bits */
117 movl %eax,%edx
118 orl %ebx,%ebx
119 jz L_more_63_no_low
120
121 orl $1,%edx
122 jmp L_more_63_no_low
123
124L_more_than_64:
125 jne L_more_than_65
126
127 /* Shift right by 65 bits */
128 /* Carry is clear if we get here */
129 movl %eax,%edx
130 rcrl %edx
131 jnc L_shift_65_nc
132
133 orl $1,%edx
134 jmp L_more_63_no_low
135
136L_shift_65_nc:
137 orl %ebx,%ebx
138 jz L_more_63_no_low
139
140 orl $1,%edx
141 jmp L_more_63_no_low
142
143L_more_than_65:
144	movl	$1,%edx	/* The shifted nr always has at least one '1' */
145
146L_more_63_no_low:
147 xorl %ebx,%ebx
148 xorl %eax,%eax
149
150L_shift_done:
151L_subtr:
152/*------------------------------+
153 | Do the subtraction |
154 +------------------------------*/
155 xorl %ecx,%ecx
156 subl %edx,%ecx
157 movl %ecx,%edx
158 movl SIGL(%esi),%ecx
159 sbbl %ebx,%ecx
160 movl %ecx,%ebx
161 movl SIGH(%esi),%ecx
162 sbbl %eax,%ecx
163 movl %ecx,%eax
164
165#ifdef PARANOID
166 /* We can never get a borrow */
167 jc L_bugged
168#endif /* PARANOID */
169
170/*--------------------------------------+
171 | Normalize the result |
172 +--------------------------------------*/
173 testl $0x80000000,%eax
174 jnz L_round /* no shifting needed */
175
176 orl %eax,%eax
177 jnz L_shift_1 /* shift left 1 - 31 bits */
178
179 orl %ebx,%ebx
180 jnz L_shift_32 /* shift left 32 - 63 bits */
181
182/*
183 * A rare case, the only one which is non-zero if we got here
184 * is: 1000000 .... 0000
185 * -0111111 .... 1111 1
186 * --------------------
187 * 0000000 .... 0000 1
188 */
189
190 cmpl $0x80000000,%edx
191 jnz L_must_be_zero
192
193 /* Shift left 64 bits */
194 subw $64,EXP(%edi)
195 xchg %edx,%eax
196 jmp fpu_reg_round
197
198L_must_be_zero:
199#ifdef PARANOID
200 orl %edx,%edx
201 jnz L_bugged_3
202#endif /* PARANOID */
203
204 /* The result is zero */
205 movw $0,EXP(%edi) /* exponent */
206 movl $0,SIGL(%edi)
207 movl $0,SIGH(%edi)
208 movl TAG_Zero,%eax
209 jmp L_exit
210
211L_shift_32:
212 movl %ebx,%eax
213 movl %edx,%ebx
214 movl $0,%edx
215 subw $32,EXP(%edi) /* Can get underflow here */
216
217/* We need to shift left by 1 - 31 bits */
218L_shift_1:
219 bsrl %eax,%ecx /* get the required shift in %ecx */
220 subl $31,%ecx
221 negl %ecx
222 shld %cl,%ebx,%eax
223 shld %cl,%edx,%ebx
224 shl %cl,%edx
225 subw %cx,EXP(%edi) /* Can get underflow here */
226
227L_round:
228 jmp fpu_reg_round /* Round the result */
229
230
231#ifdef PARANOID
232L_bugged_1:
233 pushl EX_INTERNAL|0x206
234 call EXCEPTION
235 pop %ebx
236 jmp L_error_exit
237
238L_bugged_2:
239 pushl EX_INTERNAL|0x209
240 call EXCEPTION
241 pop %ebx
242 jmp L_error_exit
243
244L_bugged_3:
245 pushl EX_INTERNAL|0x210
246 call EXCEPTION
247 pop %ebx
248 jmp L_error_exit
249
250L_bugged_4:
251 pushl EX_INTERNAL|0x211
252 call EXCEPTION
253 pop %ebx
254 jmp L_error_exit
255
256L_bugged:
257 pushl EX_INTERNAL|0x212
258 call EXCEPTION
259 pop %ebx
260 jmp L_error_exit
261
262L_error_exit:
263 movl $-1,%eax
264
265#endif /* PARANOID */
266
267L_exit:
268 popl %ebx
269 popl %edi
270 popl %esi
271 leave
272 ret
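
After the borrow-free subtraction, reg_u_sub.S renormalizes the 96-bit result (high:low:extension) by word moves for 32- and 64-bit shifts and a bsrl-computed shift of 1..31 bits, lowering the exponent accordingly. A hedged C sketch of just that normalization step (the function name is invented; __builtin_clzll stands in for bsrl):

#include <stdint.h>
#include <stdio.h>

struct nres { uint64_t sig; uint32_t ext; int exp; };

static struct nres normalize_sketch(uint64_t sig, uint32_t ext, int exp)
{
    struct nres r = { sig, ext, exp };

    if (r.sig >> 63)
        return r;                                /* already normalized */
    if (r.sig == 0 && r.ext == 0)
        return r;                                /* true zero, tagged by the caller */

    /* Word-sized moves first (the L_shift_32 and 64-bit cases) ... */
    while (!(r.sig >> 32)) {
        r.sig = (r.sig << 32) | r.ext;
        r.ext = 0;
        r.exp -= 32;
    }

    /* ... then a bsrl-style shift of 1..31 bits. */
    unsigned n = (unsigned)__builtin_clzll(r.sig);
    if (n) {
        r.sig = (r.sig << n) | (r.ext >> (32 - n));
        r.ext <<= n;
        r.exp -= (int)n;
    }
    return r;
}

int main(void)
{
    /* e.g. 1.0 - 0.5: the raw difference 0x4000... needs one left shift */
    struct nres r = normalize_sketch(0x4000000000000000ULL, 0, 0);
    printf("sig=%016llx ext=%08x exp=%d\n",
           (unsigned long long)r.sig, r.ext, r.exp);
    return 0;
}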
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
new file mode 100644
index 000000000000..bbe0e87718e4
--- /dev/null
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -0,0 +1,141 @@
1/*---------------------------------------------------------------------------+
2 | round_Xsig.S |
3 | |
4 | Copyright (C) 1992,1993,1994,1995 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
7 | |
8 | Normalize and round a 12 byte quantity. |
9 | Call from C as: |
10 | int round_Xsig(Xsig *n) |
11 | |
12 | Normalize a 12 byte quantity. |
13 | Call from C as: |
14 | int norm_Xsig(Xsig *n) |
15 | |
16 | Each function returns the size of the shift (nr of bits). |
17 | |
18 +---------------------------------------------------------------------------*/
19 .file "round_Xsig.S"
20
21#include "fpu_emu.h"
22
23
24.text
25ENTRY(round_Xsig)
26 pushl %ebp
27 movl %esp,%ebp
28 pushl %ebx /* Reserve some space */
29 pushl %ebx
30 pushl %esi
31
32 movl PARAM1,%esi
33
34 movl 8(%esi),%edx
35 movl 4(%esi),%ebx
36 movl (%esi),%eax
37
38 movl $0,-4(%ebp)
39
40 orl %edx,%edx /* ms bits */
41 js L_round /* Already normalized */
42 jnz L_shift_1 /* Shift left 1 - 31 bits */
43
44 movl %ebx,%edx
45 movl %eax,%ebx
46 xorl %eax,%eax
47 movl $-32,-4(%ebp)
48
49/* We need to shift left by 1 - 31 bits */
50L_shift_1:
51 bsrl %edx,%ecx /* get the required shift in %ecx */
52 subl $31,%ecx
53 negl %ecx
54 subl %ecx,-4(%ebp)
55 shld %cl,%ebx,%edx
56 shld %cl,%eax,%ebx
57 shl %cl,%eax
58
59L_round:
60 testl $0x80000000,%eax
61 jz L_exit
62
63 addl $1,%ebx
64 adcl $0,%edx
65 jnz L_exit
66
67 movl $0x80000000,%edx
68 incl -4(%ebp)
69
70L_exit:
71 movl %edx,8(%esi)
72 movl %ebx,4(%esi)
73 movl %eax,(%esi)
74
75 movl -4(%ebp),%eax
76
77 popl %esi
78 popl %ebx
79 leave
80 ret
81
82
83
84
85ENTRY(norm_Xsig)
86 pushl %ebp
87 movl %esp,%ebp
88 pushl %ebx /* Reserve some space */
89 pushl %ebx
90 pushl %esi
91
92 movl PARAM1,%esi
93
94 movl 8(%esi),%edx
95 movl 4(%esi),%ebx
96 movl (%esi),%eax
97
98 movl $0,-4(%ebp)
99
100 orl %edx,%edx /* ms bits */
101 js L_n_exit /* Already normalized */
102 jnz L_n_shift_1 /* Shift left 1 - 31 bits */
103
104 movl %ebx,%edx
105 movl %eax,%ebx
106 xorl %eax,%eax
107 movl $-32,-4(%ebp)
108
109 orl %edx,%edx /* ms bits */
110 js L_n_exit /* Normalized now */
111 jnz L_n_shift_1 /* Shift left 1 - 31 bits */
112
113 movl %ebx,%edx
114 movl %eax,%ebx
115 xorl %eax,%eax
116 addl $-32,-4(%ebp)
117 jmp L_n_exit /* Might not be normalized,
118 but shift no more. */
119
120/* We need to shift left by 1 - 31 bits */
121L_n_shift_1:
122 bsrl %edx,%ecx /* get the required shift in %ecx */
123 subl $31,%ecx
124 negl %ecx
125 subl %ecx,-4(%ebp)
126 shld %cl,%ebx,%edx
127 shld %cl,%eax,%ebx
128 shl %cl,%eax
129
130L_n_exit:
131 movl %edx,8(%esi)
132 movl %ebx,4(%esi)
133 movl %eax,(%esi)
134
135 movl -4(%ebp),%eax
136
137 popl %esi
138 popl %ebx
139 leave
140 ret
141
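
round_Xsig normalizes a 96-bit quantity and then rounds it to 64 bits on bit 31 of the low word, returning the (negative) number of left-shift positions; a carry out of the top bit turns the value into 0x8000...0 and bumps that count by one. An illustrative C sketch (not the kernel routine; it normalizes bit by bit for brevity where the assembly uses word moves plus bsrl/shld):

#include <stdint.h>
#include <stdio.h>

struct xsig { uint32_t lsw, midw, msw; };

static int round_xsig_sketch(struct xsig *n)
{
    int shift = 0;
    uint64_t hi = ((uint64_t)n->msw << 32) | n->midw;
    uint32_t lo = n->lsw;

    /* Normalize: bring a 1 into bit 95 of the 96-bit quantity. */
    while (!(hi >> 63) && (hi || lo)) {
        hi = (hi << 1) | (lo >> 31);
        lo <<= 1;
        shift--;
    }

    /* Round to nearest on bit 31 of the low word. */
    if (lo & 0x80000000u) {
        hi++;
        if (hi == 0) {              /* carry ran off the top */
            hi = 1ULL << 63;
            shift++;
        }
    }

    n->msw = (uint32_t)(hi >> 32);
    n->midw = (uint32_t)hi;
    n->lsw = lo;
    return shift;                   /* negative for left shifts, as the asm counts */
}

int main(void)
{
    struct xsig x = { 0x80000000u, 0xffffffffu, 0xffffffffu };
    int s = round_xsig_sketch(&x);
    printf("msw=%08x midw=%08x shift=%d\n", x.msw, x.midw, s);
    return 0;
}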
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
new file mode 100644
index 000000000000..31cdd118e918
--- /dev/null
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -0,0 +1,87 @@
1 .file "shr_Xsig.S"
2/*---------------------------------------------------------------------------+
3 | shr_Xsig.S |
4 | |
5 | 12 byte right shift function |
6 | |
7 | Copyright (C) 1992,1994,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | Call from C as: |
12 | void shr_Xsig(Xsig *arg, unsigned nr) |
13 | |
14 | Extended shift right function. |
15 | Fastest for small shifts. |
16 | Shifts the 12 byte quantity pointed to by the first arg (arg) |
17 | right by the number of bits specified by the second arg (nr). |
18 | |
19 +---------------------------------------------------------------------------*/
20
21#include "fpu_emu.h"
22
23.text
24ENTRY(shr_Xsig)
25 push %ebp
26 movl %esp,%ebp
27 pushl %esi
28 movl PARAM2,%ecx
29 movl PARAM1,%esi
30 cmpl $32,%ecx /* shrd only works for 0..31 bits */
31 jnc L_more_than_31
32
33/* less than 32 bits */
34 pushl %ebx
35 movl (%esi),%eax /* lsl */
36 movl 4(%esi),%ebx /* midl */
37 movl 8(%esi),%edx /* msl */
38 shrd %cl,%ebx,%eax
39 shrd %cl,%edx,%ebx
40 shr %cl,%edx
41 movl %eax,(%esi)
42 movl %ebx,4(%esi)
43 movl %edx,8(%esi)
44 popl %ebx
45 popl %esi
46 leave
47 ret
48
49L_more_than_31:
50 cmpl $64,%ecx
51 jnc L_more_than_63
52
53 subb $32,%cl
54 movl 4(%esi),%eax /* midl */
55 movl 8(%esi),%edx /* msl */
56 shrd %cl,%edx,%eax
57 shr %cl,%edx
58 movl %eax,(%esi)
59 movl %edx,4(%esi)
60 movl $0,8(%esi)
61 popl %esi
62 leave
63 ret
64
65L_more_than_63:
66 cmpl $96,%ecx
67 jnc L_more_than_95
68
69 subb $64,%cl
70 movl 8(%esi),%eax /* msl */
71 shr %cl,%eax
72 xorl %edx,%edx
73 movl %eax,(%esi)
74 movl %edx,4(%esi)
75 movl %edx,8(%esi)
76 popl %esi
77 leave
78 ret
79
80L_more_than_95:
81 xorl %eax,%eax
82 movl %eax,(%esi)
83 movl %eax,4(%esi)
84 movl %eax,8(%esi)
85 popl %esi
86 leave
87 ret
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
new file mode 100644
index 000000000000..59e73302aa60
--- /dev/null
+++ b/arch/x86/math-emu/status_w.h
@@ -0,0 +1,67 @@
1/*---------------------------------------------------------------------------+
2 | status_w.h |
3 | |
4 | Copyright (C) 1992,1993 |
5 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
6 | Australia. E-mail billm@vaxc.cc.monash.edu.au |
7 | |
8 +---------------------------------------------------------------------------*/
9
10#ifndef _STATUS_H_
11#define _STATUS_H_
12
13#include "fpu_emu.h" /* for definition of PECULIAR_486 */
14
15#ifdef __ASSEMBLY__
16#define Const__(x) $##x
17#else
18#define Const__(x) x
19#endif
20
21#define SW_Backward Const__(0x8000) /* backward compatibility */
22#define SW_C3 Const__(0x4000) /* condition bit 3 */
23#define SW_Top Const__(0x3800) /* top of stack */
24#define SW_Top_Shift Const__(11) /* shift for top of stack bits */
25#define SW_C2 Const__(0x0400) /* condition bit 2 */
26#define SW_C1 Const__(0x0200) /* condition bit 1 */
27#define SW_C0 Const__(0x0100) /* condition bit 0 */
28#define SW_Summary Const__(0x0080) /* exception summary */
29#define SW_Stack_Fault Const__(0x0040) /* stack fault */
30#define SW_Precision Const__(0x0020) /* loss of precision */
31#define SW_Underflow Const__(0x0010) /* underflow */
32#define SW_Overflow Const__(0x0008) /* overflow */
33#define SW_Zero_Div Const__(0x0004) /* divide by zero */
34#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */
35#define SW_Invalid Const__(0x0001) /* invalid operation */
36
37#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
38
39#ifndef __ASSEMBLY__
40
41#define COMP_A_gt_B 1
42#define COMP_A_eq_B 2
43#define COMP_A_lt_B 3
44#define COMP_No_Comp 4
45#define COMP_Denormal 0x20
46#define COMP_NaN 0x40
47#define COMP_SNaN 0x80
48
49#define status_word() \
50 ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
51static inline void setcc(int cc)
52{
53 partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3);
54 partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3);
55}
56
57#ifdef PECULIAR_486
58 /* Default, this conveys no information, but an 80486 does it. */
59 /* Clear the SW_C1 bit, "other bits undefined". */
60# define clear_C1() { partial_status &= ~SW_C1; }
61# else
62# define clear_C1()
63#endif /* PECULIAR_486 */
64
65#endif /* __ASSEMBLY__ */
66
67#endif /* _STATUS_H_ */
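
The setcc()/status_word() pair above is how the emulator reports comparison results through the x87 condition bits. A hedged, self-contained illustration follows; the real partial_status and top variables live elsewhere in the emulator, so stand-ins are declared here, and the FCOM-style encodings (C3 for equal, C3|C2|C0 for unordered) are standard x87 behaviour rather than anything defined in this header:

#include <stdio.h>

#define SW_C3 0x4000
#define SW_C2 0x0400
#define SW_C1 0x0200
#define SW_C0 0x0100
#define SW_Top 0x3800
#define SW_Top_Shift 11

static unsigned short partial_status;   /* stand-in for the emulator global */
static int top;                          /* stand-in for the stack-top index */

static void setcc(int cc)
{
    partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
    partial_status |= cc & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
}

static unsigned short status_word(void)
{
    return (partial_status & ~SW_Top & 0xffff) |
           ((top << SW_Top_Shift) & SW_Top);
}

int main(void)
{
    top = 2;
    setcc(SW_C3);                        /* "equal" after a comparison */
    printf("status word = %#x\n", status_word());
    setcc(SW_C0 | SW_C2 | SW_C3);        /* "unordered" (a NaN was involved) */
    printf("status word = %#x\n", status_word());
    return 0;
}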
diff --git a/arch/x86/math-emu/version.h b/arch/x86/math-emu/version.h
new file mode 100644
index 000000000000..a0d73a1d2b67
--- /dev/null
+++ b/arch/x86/math-emu/version.h
@@ -0,0 +1,12 @@
1/*---------------------------------------------------------------------------+
2 | version.h |
3 | |
4 | |
5 | Copyright (C) 1992,1993,1994,1996,1997,1999 |
6 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
7 | E-mail billm@melbpc.org.au |
8 | |
9 | |
10 +---------------------------------------------------------------------------*/
11
12#define FPU_VERSION "wm-FPU-emu version 2.01"
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
new file mode 100644
index 000000000000..518428317985
--- /dev/null
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -0,0 +1,204 @@
1 .file "wm_shrx.S"
2/*---------------------------------------------------------------------------+
3 | wm_shrx.S |
4 | |
5 | 64 bit right shift functions |
6 | |
7 | Copyright (C) 1992,1995 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@jacobi.maths.monash.edu.au |
10 | |
11 | Call from C as: |
12 | unsigned FPU_shrx(void *arg1, unsigned arg2) |
13 | and |
14 | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
15 | |
16 +---------------------------------------------------------------------------*/
17
18#include "fpu_emu.h"
19
20.text
21/*---------------------------------------------------------------------------+
22 | unsigned FPU_shrx(void *arg1, unsigned arg2) |
23 | |
24 | Extended shift right function. |
25 | Fastest for small shifts. |
26 | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
27 | right by the number of bits specified by the second arg (arg2). |
28 | Forms a 96 bit quantity from the 64 bit arg and eax: |
29 | [ 64 bit arg ][ eax ] |
30 | shift right ---------> |
31 | The eax register is initialized to 0 before the shifting. |
32 | Results returned in the 64 bit arg and eax. |
33 +---------------------------------------------------------------------------*/
34
35ENTRY(FPU_shrx)
36 push %ebp
37 movl %esp,%ebp
38 pushl %esi
39 movl PARAM2,%ecx
40 movl PARAM1,%esi
41 cmpl $32,%ecx /* shrd only works for 0..31 bits */
42 jnc L_more_than_31
43
44/* less than 32 bits */
45 pushl %ebx
46 movl (%esi),%ebx /* lsl */
47 movl 4(%esi),%edx /* msl */
48 xorl %eax,%eax /* extension */
49 shrd %cl,%ebx,%eax
50 shrd %cl,%edx,%ebx
51 shr %cl,%edx
52 movl %ebx,(%esi)
53 movl %edx,4(%esi)
54 popl %ebx
55 popl %esi
56 leave
57 ret
58
59L_more_than_31:
60 cmpl $64,%ecx
61 jnc L_more_than_63
62
63 subb $32,%cl
64 movl (%esi),%eax /* lsl */
65 movl 4(%esi),%edx /* msl */
66 shrd %cl,%edx,%eax
67 shr %cl,%edx
68 movl %edx,(%esi)
69 movl $0,4(%esi)
70 popl %esi
71 leave
72 ret
73
74L_more_than_63:
75 cmpl $96,%ecx
76 jnc L_more_than_95
77
78 subb $64,%cl
79 movl 4(%esi),%eax /* msl */
80 shr %cl,%eax
81 xorl %edx,%edx
82 movl %edx,(%esi)
83 movl %edx,4(%esi)
84 popl %esi
85 leave
86 ret
87
88L_more_than_95:
89 xorl %eax,%eax
90 movl %eax,(%esi)
91 movl %eax,4(%esi)
92 popl %esi
93 leave
94 ret
95
96
97/*---------------------------------------------------------------------------+
98 | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
99 | |
100 | Extended shift right function (optimized for small floating point |
101 | integers). |
102 | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
103 | right by the number of bits specified by the second arg (arg2). |
104 | Forms a 96 bit quantity from the 64 bit arg and eax: |
105 | [ 64 bit arg ][ eax ] |
106 | shift right ---------> |
107 | The eax register is initialized to 0 before the shifting. |
108 | The lower 8 bits of eax are lost and replaced by a flag which is |
109 | set (to 0x01) if any bit, apart from the first one, is set in the |
110 | part which has been shifted out of the arg. |
111 | Results returned in the 64 bit arg and eax. |
112 +---------------------------------------------------------------------------*/
113ENTRY(FPU_shrxs)
114 push %ebp
115 movl %esp,%ebp
116 pushl %esi
117 pushl %ebx
118 movl PARAM2,%ecx
119 movl PARAM1,%esi
120	cmpl	$64,%ecx	/* shifts of 64 or more are handled separately */
121 jnc Ls_more_than_63
122
123 cmpl $32,%ecx /* shrd only works for 0..31 bits */
124 jc Ls_less_than_32
125
126/* We got here without jumps by assuming that the most common requirement
127 is for small integers */
128/* Shift by [32..63] bits */
129 subb $32,%cl
130 movl (%esi),%eax /* lsl */
131 movl 4(%esi),%edx /* msl */
132 xorl %ebx,%ebx
133 shrd %cl,%eax,%ebx
134 shrd %cl,%edx,%eax
135 shr %cl,%edx
136 orl %ebx,%ebx /* test these 32 bits */
137 setne %bl
138 test $0x7fffffff,%eax /* and 31 bits here */
139 setne %bh
140 orw %bx,%bx /* Any of the 63 bit set ? */
141 setne %al
142 movl %edx,(%esi)
143 movl $0,4(%esi)
144 popl %ebx
145 popl %esi
146 leave
147 ret
148
149/* Shift by [0..31] bits */
150Ls_less_than_32:
151 movl (%esi),%ebx /* lsl */
152 movl 4(%esi),%edx /* msl */
153 xorl %eax,%eax /* extension */
154 shrd %cl,%ebx,%eax
155 shrd %cl,%edx,%ebx
156 shr %cl,%edx
157 test $0x7fffffff,%eax /* only need to look at eax here */
158 setne %al
159 movl %ebx,(%esi)
160 movl %edx,4(%esi)
161 popl %ebx
162 popl %esi
163 leave
164 ret
165
166/* Shift by [64..95] bits */
167Ls_more_than_63:
168 cmpl $96,%ecx
169 jnc Ls_more_than_95
170
171 subb $64,%cl
172 movl (%esi),%ebx /* lsl */
173 movl 4(%esi),%eax /* msl */
174 xorl %edx,%edx /* extension */
175 shrd %cl,%ebx,%edx
176 shrd %cl,%eax,%ebx
177 shr %cl,%eax
178 orl %ebx,%edx
179 setne %bl
180 test $0x7fffffff,%eax /* only need to look at eax here */
181 setne %bh
182 orw %bx,%bx
183 setne %al
184 xorl %edx,%edx
185 movl %edx,(%esi) /* set to zero */
186 movl %edx,4(%esi) /* set to zero */
187 popl %ebx
188 popl %esi
189 leave
190 ret
191
192Ls_more_than_95:
193/* Shift by [96..inf) bits */
194 xorl %eax,%eax
195 movl (%esi),%ebx
196 orl 4(%esi),%ebx
197 setne %al
198 xorl %ebx,%ebx
199 movl %ebx,(%esi)
200 movl %ebx,4(%esi)
201 popl %ebx
202 popl %esi
203 leave
204 ret
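
FPU_shrxs implements a "sticky" right shift: the shifted-out bits are returned in a 32-bit extension whose lowest byte is replaced by a flag saying whether any bit other than the leading shifted-out bit was lost. A hedged C sketch of that idea (function name invented; shifts of 64 bits or more are collapsed into the flag here, whereas the assembly still preserves the leading bits for 64..95):

#include <stdint.h>
#include <stdio.h>

static uint32_t shrxs_sketch(uint64_t *arg, unsigned nr)
{
    uint64_t v = *arg;
    uint32_t ext;

    if (nr == 0)
        return 0;

    if (nr < 64) {
        uint64_t out = v << (64 - nr);            /* shifted-out bits, msb-aligned */
        *arg = v >> nr;
        ext = (uint32_t)(out >> 32);
        if ((ext & 0x7fffffffu) || (uint32_t)out)
            ext = (ext & 0xffffff00u) | 1;        /* sticky flag in the low byte */
        else
            ext &= 0xffffff00u;
    } else {
        *arg = 0;
        ext = (v != 0);                           /* everything went into the flag */
    }
    return ext;
}

int main(void)
{
    uint64_t x = 0x8000000000000003ULL;
    uint32_t e = shrxs_sketch(&x, 4);
    printf("arg=%016llx ext=%08x\n", (unsigned long long)x, e);
    return 0;
}

Keeping the leading shifted-out bit intact while summarizing the rest in one flag is exactly what the later rounding step needs to decide between round-down, round-up, and the halfway case.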
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
new file mode 100644
index 000000000000..d258f59564e1
--- /dev/null
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -0,0 +1,470 @@
1 .file "wm_sqrt.S"
2/*---------------------------------------------------------------------------+
3 | wm_sqrt.S |
4 | |
5 | Fixed point arithmetic square root evaluation. |
6 | |
7 | Copyright (C) 1992,1993,1995,1997 |
8 | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
9 | Australia. E-mail billm@suburbia.net |
10 | |
11 | Call from C as: |
12 | int wm_sqrt(FPU_REG *n, unsigned int control_word) |
13 | |
14 +---------------------------------------------------------------------------*/
15
16/*---------------------------------------------------------------------------+
17 | wm_sqrt(FPU_REG *n, unsigned int control_word) |
18 | returns the square root of n in n. |
19 | |
20 | Use Newton's method to compute the square root of a number, which must |
21 | be in the range [1.0 .. 4.0), to 64 bits accuracy. |
22 | Does not check the sign or tag of the argument. |
23 | Sets the exponent, but not the sign or tag of the result. |
24 | |
25 | The guess is kept in %esi:%edi |
26 +---------------------------------------------------------------------------*/
27
28#include "exception.h"
29#include "fpu_emu.h"
30
31
32#ifndef NON_REENTRANT_FPU
33/* Local storage on the stack: */
34#define FPU_accum_3 -4(%ebp) /* ms word */
35#define FPU_accum_2 -8(%ebp)
36#define FPU_accum_1 -12(%ebp)
37#define FPU_accum_0 -16(%ebp)
38
39/*
40 * The de-normalised argument:
41 * sq_2 sq_1 sq_0
42 * b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
43 * ^ binary point here
44 */
45#define FPU_fsqrt_arg_2 -20(%ebp) /* ms word */
46#define FPU_fsqrt_arg_1 -24(%ebp)
47#define FPU_fsqrt_arg_0 -28(%ebp) /* ls word, at most the ms bit is set */
48
49#else
50/* Local storage in a static area: */
51.data
52 .align 4,0
53FPU_accum_3:
54 .long 0 /* ms word */
55FPU_accum_2:
56 .long 0
57FPU_accum_1:
58 .long 0
59FPU_accum_0:
60 .long 0
61
62/* The de-normalised argument:
63 sq_2 sq_1 sq_0
64 b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
65 ^ binary point here
66 */
67FPU_fsqrt_arg_2:
68 .long 0 /* ms word */
69FPU_fsqrt_arg_1:
70 .long 0
71FPU_fsqrt_arg_0:
72 .long 0 /* ls word, at most the ms bit is set */
73#endif /* NON_REENTRANT_FPU */
74
75
76.text
77ENTRY(wm_sqrt)
78 pushl %ebp
79 movl %esp,%ebp
80#ifndef NON_REENTRANT_FPU
81 subl $28,%esp
82#endif /* NON_REENTRANT_FPU */
83 pushl %esi
84 pushl %edi
85 pushl %ebx
86
87 movl PARAM1,%esi
88
89 movl SIGH(%esi),%eax
90 movl SIGL(%esi),%ecx
91 xorl %edx,%edx
92
93/* We use a rough linear estimate for the first guess.. */
94
95 cmpw EXP_BIAS,EXP(%esi)
96 jnz sqrt_arg_ge_2
97
98 shrl $1,%eax /* arg is in the range [1.0 .. 2.0) */
99 rcrl $1,%ecx
100 rcrl $1,%edx
101
102sqrt_arg_ge_2:
103/* From here on, n is never accessed directly again until it is
104 replaced by the answer. */
105
106 movl %eax,FPU_fsqrt_arg_2 /* ms word of n */
107 movl %ecx,FPU_fsqrt_arg_1
108 movl %edx,FPU_fsqrt_arg_0
109
110/* Make a linear first estimate */
111 shrl $1,%eax
112 addl $0x40000000,%eax
113 movl $0xaaaaaaaa,%ecx
114 mull %ecx
115 shll %edx /* max result was 7fff... */
116 testl $0x80000000,%edx /* but min was 3fff... */
117 jnz sqrt_prelim_no_adjust
118
119 movl $0x80000000,%edx /* round up */
120
121sqrt_prelim_no_adjust:
122 movl %edx,%esi /* Our first guess */
123
124/* We have now computed (approx) (2 + x) / 3, which forms the basis
125 for a few iterations of Newton's method */
126
127 movl FPU_fsqrt_arg_2,%ecx /* ms word */
128
129/*
130 * From our initial estimate, three iterations are enough to get us
131 * to 30 bits or so. This will then allow two iterations at better
132 * precision to complete the process.
133 */
134
135/* Compute (g + n/g)/2 at each iteration (g is the guess). */
136 shrl %ecx /* Doing this first will prevent a divide */
137 /* overflow later. */
138
139 movl %ecx,%edx /* msw of the arg / 2 */
140 divl %esi /* current estimate */
141 shrl %esi /* divide by 2 */
142 addl %eax,%esi /* the new estimate */
143
144 movl %ecx,%edx
145 divl %esi
146 shrl %esi
147 addl %eax,%esi
148
149 movl %ecx,%edx
150 divl %esi
151 shrl %esi
152 addl %eax,%esi
153
154/*
155 * Now that an estimate accurate to about 30 bits has been obtained (in %esi),
156 * we improve it to 60 bits or so.
157 *
158 * The strategy from now on is to compute new estimates from
159 * guess := guess + (n - guess^2) / (2 * guess)
160 */
161
162/* First, find the square of the guess */
163 movl %esi,%eax
164 mull %esi
165/* guess^2 now in %edx:%eax */
166
167 movl FPU_fsqrt_arg_1,%ecx
168 subl %ecx,%eax
169 movl FPU_fsqrt_arg_2,%ecx /* ms word of normalized n */
170 sbbl %ecx,%edx
171 jnc sqrt_stage_2_positive
172
173/* Subtraction gives a negative result,
174 negate the result before division. */
175 notl %edx
176 notl %eax
177 addl $1,%eax
178 adcl $0,%edx
179
180 divl %esi
181 movl %eax,%ecx
182
183 movl %edx,%eax
184 divl %esi
185 jmp sqrt_stage_2_finish
186
187sqrt_stage_2_positive:
188 divl %esi
189 movl %eax,%ecx
190
191 movl %edx,%eax
192 divl %esi
193
194 notl %ecx
195 notl %eax
196 addl $1,%eax
197 adcl $0,%ecx
198
199sqrt_stage_2_finish:
200 sarl $1,%ecx /* divide by 2 */
201 rcrl $1,%eax
202
203 /* Form the new estimate in %esi:%edi */
204 movl %eax,%edi
205 addl %ecx,%esi
206
207 jnz sqrt_stage_2_done /* result should be [1..2) */
208
209#ifdef PARANOID
210/* It should be possible to get here only if the arg is ffff....ffff */
211 cmp $0xffffffff,FPU_fsqrt_arg_1
212 jnz sqrt_stage_2_error
213#endif /* PARANOID */
214
215/* The best rounded result. */
216 xorl %eax,%eax
217 decl %eax
218 movl %eax,%edi
219 movl %eax,%esi
220 movl $0x7fffffff,%eax
221 jmp sqrt_round_result
222
223#ifdef PARANOID
224sqrt_stage_2_error:
225 pushl EX_INTERNAL|0x213
226 call EXCEPTION
227#endif /* PARANOID */
228
229sqrt_stage_2_done:
230
231/* Now the square root has been computed to better than 60 bits. */
232
233/* Find the square of the guess. */
234 movl %edi,%eax /* ls word of guess */
235 mull %edi
236 movl %edx,FPU_accum_1
237
238 movl %esi,%eax
239 mull %esi
240 movl %edx,FPU_accum_3
241 movl %eax,FPU_accum_2
242
243 movl %edi,%eax
244 mull %esi
245 addl %eax,FPU_accum_1
246 adcl %edx,FPU_accum_2
247 adcl $0,FPU_accum_3
248
249/* movl %esi,%eax */
250/* mull %edi */
251 addl %eax,FPU_accum_1
252 adcl %edx,FPU_accum_2
253 adcl $0,FPU_accum_3
254
255/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */
256
257 movl FPU_fsqrt_arg_0,%eax /* get normalized n */
258 subl %eax,FPU_accum_1
259 movl FPU_fsqrt_arg_1,%eax
260 sbbl %eax,FPU_accum_2
261 movl FPU_fsqrt_arg_2,%eax /* ms word of normalized n */
262 sbbl %eax,FPU_accum_3
263 jnc sqrt_stage_3_positive
264
265/* Subtraction gives a negative result,
266 negate the result before division */
267 notl FPU_accum_1
268 notl FPU_accum_2
269 notl FPU_accum_3
270 addl $1,FPU_accum_1
271 adcl $0,FPU_accum_2
272
273#ifdef PARANOID
274 adcl $0,FPU_accum_3 /* This must be zero */
275 jz sqrt_stage_3_no_error
276
277sqrt_stage_3_error:
278 pushl EX_INTERNAL|0x207
279 call EXCEPTION
280
281sqrt_stage_3_no_error:
282#endif /* PARANOID */
283
284 movl FPU_accum_2,%edx
285 movl FPU_accum_1,%eax
286 divl %esi
287 movl %eax,%ecx
288
289 movl %edx,%eax
290 divl %esi
291
292 sarl $1,%ecx /* divide by 2 */
293 rcrl $1,%eax
294
295 /* prepare to round the result */
296
297 addl %ecx,%edi
298 adcl $0,%esi
299
300 jmp sqrt_stage_3_finished
301
302sqrt_stage_3_positive:
303 movl FPU_accum_2,%edx
304 movl FPU_accum_1,%eax
305 divl %esi
306 movl %eax,%ecx
307
308 movl %edx,%eax
309 divl %esi
310
311 sarl $1,%ecx /* divide by 2 */
312 rcrl $1,%eax
313
314 /* prepare to round the result */
315
316 notl %eax /* Negate the correction term */
317 notl %ecx
318 addl $1,%eax
319 adcl $0,%ecx /* carry here ==> correction == 0 */
320 adcl $0xffffffff,%esi
321
322 addl %ecx,%edi
323 adcl $0,%esi
324
325sqrt_stage_3_finished:
326
327/*
328 * The result in %esi:%edi:%eax should be good to about 90 bits here,
329 * and the rounding information here does not have sufficient accuracy
330 * in a few rare cases.
331 */
332 cmpl $0xffffffe0,%eax
333 ja sqrt_near_exact_x
334
335 cmpl $0x00000020,%eax
336 jb sqrt_near_exact
337
338 cmpl $0x7fffffe0,%eax
339 jb sqrt_round_result
340
341 cmpl $0x80000020,%eax
342 jb sqrt_get_more_precision
343
344sqrt_round_result:
345/* Set up for rounding operations */
346 movl %eax,%edx
347 movl %esi,%eax
348 movl %edi,%ebx
349 movl PARAM1,%edi
350 movw EXP_BIAS,EXP(%edi) /* Result is in [1.0 .. 2.0) */
351 jmp fpu_reg_round
352
353
354sqrt_near_exact_x:
355/* First, the estimate must be rounded up. */
356 addl $1,%edi
357 adcl $0,%esi
358
359sqrt_near_exact:
360/*
361 * This is an easy case because x^1/2 is monotonic.
362 * We just need to find the square of our estimate, compare it
363 * with the argument, and deduce whether our estimate is
364 * above, below, or exact. We use the fact that the estimate
365 * is known to be accurate to about 90 bits.
366 */
367 movl %edi,%eax /* ls word of guess */
368 mull %edi
369 movl %edx,%ebx /* 2nd ls word of square */
370 movl %eax,%ecx /* ls word of square */
371
372 movl %edi,%eax
373 mull %esi
374 addl %eax,%ebx
375 addl %eax,%ebx
376
377#ifdef PARANOID
378 cmp $0xffffffb0,%ebx
379 jb sqrt_near_exact_ok
380
381 cmp $0x00000050,%ebx
382 ja sqrt_near_exact_ok
383
384 pushl EX_INTERNAL|0x214
385 call EXCEPTION
386
387sqrt_near_exact_ok:
388#endif /* PARANOID */
389
390 or %ebx,%ebx
391 js sqrt_near_exact_small
392
393 jnz sqrt_near_exact_large
394
395 or %ebx,%edx
396 jnz sqrt_near_exact_large
397
398/* Our estimate is exactly the right answer */
399 xorl %eax,%eax
400 jmp sqrt_round_result
401
402sqrt_near_exact_small:
403/* Our estimate is too small */
404 movl $0x000000ff,%eax
405 jmp sqrt_round_result
406
407sqrt_near_exact_large:
408/* Our estimate is too large, we need to decrement it */
409 subl $1,%edi
410 sbbl $0,%esi
411 movl $0xffffff00,%eax
412 jmp sqrt_round_result
413
414
415sqrt_get_more_precision:
416/* This case is almost the same as the above, except we start
417 with an extra bit of precision in the estimate. */
418 stc /* The extra bit. */
419 rcll $1,%edi /* Shift the estimate left one bit */
420 rcll $1,%esi
421
422 movl %edi,%eax /* ls word of guess */
423 mull %edi
424 movl %edx,%ebx /* 2nd ls word of square */
425 movl %eax,%ecx /* ls word of square */
426
427 movl %edi,%eax
428 mull %esi
429 addl %eax,%ebx
430 addl %eax,%ebx
431
432/* Put our estimate back to its original value */
433 stc /* The ms bit. */
434	rcrl	$1,%esi		/* Shift the estimate right one bit */
435 rcrl $1,%edi
436
437#ifdef PARANOID
438 cmp $0xffffff60,%ebx
439 jb sqrt_more_prec_ok
440
441 cmp $0x000000a0,%ebx
442 ja sqrt_more_prec_ok
443
444 pushl EX_INTERNAL|0x215
445 call EXCEPTION
446
447sqrt_more_prec_ok:
448#endif /* PARANOID */
449
450 or %ebx,%ebx
451 js sqrt_more_prec_small
452
453 jnz sqrt_more_prec_large
454
455 or %ebx,%ecx
456 jnz sqrt_more_prec_large
457
458/* Our estimate is exactly the right answer */
459 movl $0x80000000,%eax
460 jmp sqrt_round_result
461
462sqrt_more_prec_small:
463/* Our estimate is too small */
464 movl $0x800000ff,%eax
465 jmp sqrt_round_result
466
467sqrt_more_prec_large:
468/* Our estimate is too large */
469 movl $0x7fffff00,%eax
470 jmp sqrt_round_result
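
Stripped of the fixed-point bookkeeping, wm_sqrt is the classic Newton iteration: start from the linear estimate (2 + n) / 3 for n in [1, 4) and refine with g <- g + (n - g*g) / (2*g), which is the same as (g + n/g)/2. The double-precision sketch below only illustrates the numerics (it is not the emulator code, which carries 64 and then ~90 bits in integer registers):

#include <stdio.h>

static double wm_sqrt_sketch(double n)    /* assumes 1.0 <= n < 4.0 */
{
    double g = (2.0 + n) / 3.0;           /* rough linear first estimate */
    int i;

    for (i = 0; i < 5; i++)               /* the assembly does 3 coarse +  */
        g = g + (n - g * g) / (2.0 * g);  /* 2 refined steps; 5 is plenty  */
    return g;
}

int main(void)
{
    printf("sqrt(2)      ~ %.17g\n", wm_sqrt_sketch(2.0));
    printf("sqrt(3.9999) ~ %.17g\n", wm_sqrt_sketch(3.9999));
    return 0;
}

Because Newton's method roughly doubles the number of correct bits per step, three iterations from the chord estimate reach ~30 bits and two more reach the ~60 and ~90 bit stages the assembly comments describe.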
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000000..983291096848
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/mm/Makefile_32
3else
4include ${srctree}/arch/x86/mm/Makefile_64
5endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
new file mode 100644
index 000000000000..362b4ad082de
--- /dev/null
+++ b/arch/x86/mm/Makefile_32
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux i386-specific parts of the memory manager.
3#
4
5obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
6
7obj-$(CONFIG_NUMA) += discontig_32.o
8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
new file mode 100644
index 000000000000..6bcb47945b87
--- /dev/null
+++ b/arch/x86/mm/Makefile_64
@@ -0,0 +1,10 @@
1#
2# Makefile for the linux x86_64-specific parts of the memory manager.
3#
4
5obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
7obj-$(CONFIG_NUMA) += numa_64.o
8obj-$(CONFIG_K8_NUMA) += k8topology_64.o
9obj-$(CONFIG_ACPI_NUMA) += srat_64.o
10
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
new file mode 100644
index 000000000000..4de95a17a7d4
--- /dev/null
+++ b/arch/x86/mm/boot_ioremap_32.c
@@ -0,0 +1,100 @@
1/*
2 * arch/i386/mm/boot_ioremap.c
3 *
4 * Re-map functions for early boot-time before paging_init() when the
5 * boot-time pagetables are still in use
6 *
7 * Written by Dave Hansen <haveblue@us.ibm.com>
8 */
9
10
11/*
12 * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
13 * keeps that from happening. If anyone has a better way, I'm listening.
14 *
15 * boot_pte_t is defined only if this all works correctly
16 */
17
18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
20#include <asm/page.h>
21#include <asm/pgtable.h>
22#include <asm/tlbflush.h>
23#include <linux/init.h>
24#include <linux/stddef.h>
25
26/*
27 * I'm cheating here. It is known that the two boot PTE pages are
28 * allocated next to each other. I'm pretending that they're just
29 * one big array.
30 */
31
32#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
33
34static unsigned long boot_pte_index(unsigned long vaddr)
35{
36 return __pa(vaddr) >> PAGE_SHIFT;
37}
38
39static inline boot_pte_t* boot_vaddr_to_pte(void *address)
40{
41 boot_pte_t* boot_pg = (boot_pte_t*)pg0;
42 return &boot_pg[boot_pte_index((unsigned long)address)];
43}
44
45/*
46 * This is only for a caller who is clever enough to page-align
47 * phys_addr and virtual_source, and who also has a preference
48 * about which virtual address to steal ptes from
49 */
50static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
51 void* virtual_source)
52{
53 boot_pte_t* pte;
54 int i;
55 char *vaddr = virtual_source;
56
57 pte = boot_vaddr_to_pte(virtual_source);
58 for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
59 set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
60 __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
61 }
62}
63
64/* the virtual space we're going to remap comes from this array */
65#define BOOT_IOREMAP_PAGES 4
66#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
67static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
68 __attribute__ ((aligned (PAGE_SIZE)));
69
70/*
71 * This only applies to things which need to ioremap before paging_init();
72 * bt_ioremap() and plain ioremap() are both useless at this point.
73 *
74 * When used, we're still using the boot-time pagetables, which only
75 * have 2 PTE pages mapping the first 8MB
76 *
77 * There is no unmap. The boot-time PTE pages aren't used after boot.
78 * If you really want the space back, just remap it yourself.
79 * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
80 */
81__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
82{
83 unsigned long last_addr, offset;
84 unsigned int nrpages;
85
86 last_addr = phys_addr + size - 1;
87
88 /* page align the requested address */
89 offset = phys_addr & ~PAGE_MASK;
90 phys_addr &= PAGE_MASK;
91 size = PAGE_ALIGN(last_addr) - phys_addr;
92
93 nrpages = size >> PAGE_SHIFT;
94 if (nrpages > BOOT_IOREMAP_PAGES)
95 return NULL;
96
97 __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
98
99 return &boot_ioremap_space[offset];
100}
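
Before touching any PTEs, boot_ioremap() splits the request into a page-aligned physical base, a page count, and an offset into the first page that is added back to the returned virtual address. A small userspace illustration of just that address arithmetic (assuming 4 KiB pages; the example address and sizes are made up):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(a) (((a) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
    unsigned long phys_addr = 0x000fe123;    /* example request */
    unsigned long size = 0x300;
    unsigned long last_addr = phys_addr + size - 1;
    unsigned long offset = phys_addr & ~PAGE_MASK;

    phys_addr &= PAGE_MASK;                  /* page align the base */
    size = PAGE_ALIGN(last_addr) - phys_addr;

    printf("base=%#lx pages=%lu offset=%#lx\n",
           phys_addr, size >> PAGE_SHIFT, offset);
    return 0;
}

If the resulting page count exceeds BOOT_IOREMAP_PAGES (4), boot_ioremap() simply returns NULL; there is no unmap, since the boot-time PTEs are thrown away after paging_init().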
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
new file mode 100644
index 000000000000..860e912a3fbb
--- /dev/null
+++ b/arch/x86/mm/discontig_32.c
@@ -0,0 +1,431 @@
1/*
2 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
3 * August 2002: added remote node KVA remap - Martin J. Bligh
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/mm.h>
26#include <linux/bootmem.h>
27#include <linux/mmzone.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35
36#include <asm/e820.h>
37#include <asm/setup.h>
38#include <asm/mmzone.h>
39#include <bios_ebda.h>
40
41struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
42EXPORT_SYMBOL(node_data);
43bootmem_data_t node0_bdata;
44
45/*
46 * numa interface - we expect the numa architecture specific code to have
47 * populated the following during initialisation:
48 *
49 * 1) node_online_map - the map of all nodes configured (online) in the system
50 * 2) node_start_pfn - the starting page frame number for a node
51 * 3) node_end_pfn - the ending page frame number for a node
52 */
53unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
54unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
55
56
57#ifdef CONFIG_DISCONTIGMEM
58/*
59 * 4) physnode_map - the mapping between a pfn and owning node
60 * physnode_map keeps track of the physical memory layout of a generic
61 * numa node on a 256Mb break (each element of the array will
62 * represent 256Mb of memory and will be marked by the node id. so,
63 * if the first gig is on node 0, and the second gig is on node 1
64 * physnode_map will contain:
65 *
66 * physnode_map[0-3] = 0;
67 * physnode_map[4-7] = 1;
68 * physnode_map[8- ] = -1;
69 */
70s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
71EXPORT_SYMBOL(physnode_map);
72
73void memory_present(int nid, unsigned long start, unsigned long end)
74{
75 unsigned long pfn;
76
77 printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
78 nid, start, end);
79 printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
80 printk(KERN_DEBUG " ");
81 for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
82 physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
83 printk("%ld ", pfn);
84 }
85 printk("\n");
86}
87
88unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
89 unsigned long end_pfn)
90{
91 unsigned long nr_pages = end_pfn - start_pfn;
92
93 if (!nr_pages)
94 return 0;
95
96 return (nr_pages + 1) * sizeof(struct page);
97}
98#endif
99
100extern unsigned long find_max_low_pfn(void);
101extern void add_one_highpage_init(struct page *, int, int);
102extern unsigned long highend_pfn, highstart_pfn;
103
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105
106unsigned long node_remap_start_pfn[MAX_NUMNODES];
107unsigned long node_remap_size[MAX_NUMNODES];
108unsigned long node_remap_offset[MAX_NUMNODES];
109void *node_remap_start_vaddr[MAX_NUMNODES];
110void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
111
112void *node_remap_end_vaddr[MAX_NUMNODES];
113void *node_remap_alloc_vaddr[MAX_NUMNODES];
114static unsigned long kva_start_pfn;
115static unsigned long kva_pages;
116/*
117 * FLAT - support for basic PC memory model with discontig enabled, essentially
118 * a single node with all available processors in it with a flat
119 * memory map.
120 */
121int __init get_memcfg_numa_flat(void)
122{
123 printk("NUMA - single node, flat memory mode\n");
124
125 /* Run the memory configuration and find the top of memory. */
126 find_max_pfn();
127 node_start_pfn[0] = 0;
128 node_end_pfn[0] = max_pfn;
129 memory_present(0, 0, max_pfn);
130
131 /* Indicate there is one node available. */
132 nodes_clear(node_online_map);
133 node_set_online(0);
134 return 1;
135}
136
137/*
138 * Find the highest page frame number we have available for the node
139 */
140static void __init find_max_pfn_node(int nid)
141{
142 if (node_end_pfn[nid] > max_pfn)
143 node_end_pfn[nid] = max_pfn;
144 /*
145 * if a user has given mem=XXXX, then we need to make sure
146 * that the node _starts_ before that, too, not just ends
147 */
148 if (node_start_pfn[nid] > max_pfn)
149 node_start_pfn[nid] = max_pfn;
150 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
151}
152
153/*
154 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
155 * method. For node zero take this from the bottom of memory, for
156 * subsequent nodes place them at node_remap_start_vaddr which contains
157 * node local data in physically node local memory. See setup_memory()
158 * for details.
159 */
160static void __init allocate_pgdat(int nid)
161{
162 if (nid && node_has_online_mem(nid))
163 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
164 else {
165 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
166 min_low_pfn += PFN_UP(sizeof(pg_data_t));
167 }
168}
169
170void *alloc_remap(int nid, unsigned long size)
171{
172 void *allocation = node_remap_alloc_vaddr[nid];
173
174 size = ALIGN(size, L1_CACHE_BYTES);
175
176 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
177 return 0;
178
179 node_remap_alloc_vaddr[nid] += size;
180 memset(allocation, 0, size);
181
182 return allocation;
183}
184
185void __init remap_numa_kva(void)
186{
187 void *vaddr;
188 unsigned long pfn;
189 int node;
190
191 for_each_online_node(node) {
192 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
193 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
194 set_pmd_pfn((ulong) vaddr,
195 node_remap_start_pfn[node] + pfn,
196 PAGE_KERNEL_LARGE);
197 }
198 }
199}
200
201static unsigned long calculate_numa_remap_pages(void)
202{
203 int nid;
204 unsigned long size, reserve_pages = 0;
205 unsigned long pfn;
206
207 for_each_online_node(nid) {
208 unsigned old_end_pfn = node_end_pfn[nid];
209
210 /*
211		 * The acpi/srat node info can show hot-add memory zones
212 * where memory could be added but not currently present.
213 */
214 if (node_start_pfn[nid] > max_pfn)
215 continue;
216 if (node_end_pfn[nid] > max_pfn)
217 node_end_pfn[nid] = max_pfn;
218
219 /* ensure the remap includes space for the pgdat. */
220 size = node_remap_size[nid] + sizeof(pg_data_t);
221
222 /* convert size to large (pmd size) pages, rounding up */
223 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
224 /* now the roundup is correct, convert to PAGE_SIZE pages */
225 size = size * PTRS_PER_PTE;
226
227 /*
228 * Validate the region we are allocating only contains valid
229 * pages.
230 */
231 for (pfn = node_end_pfn[nid] - size;
232 pfn < node_end_pfn[nid]; pfn++)
233 if (!page_is_ram(pfn))
234 break;
235
236 if (pfn != node_end_pfn[nid])
237 size = 0;
238
239 printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
240 size, nid);
241 node_remap_size[nid] = size;
242 node_remap_offset[nid] = reserve_pages;
243 reserve_pages += size;
244 printk("Shrinking node %d from %ld pages to %ld pages\n",
245 nid, node_end_pfn[nid], node_end_pfn[nid] - size);
246
247 if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
248 /*
249 * Align node_end_pfn[] and node_remap_start_pfn[] to
250 * pmd boundary. remap_numa_kva will barf otherwise.
251 */
252 printk("Shrinking node %d further by %ld pages for proper alignment\n",
253 nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
254 size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
255 }
256
257 node_end_pfn[nid] -= size;
258 node_remap_start_pfn[nid] = node_end_pfn[nid];
259 shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
260 }
261 printk("Reserving total of %ld pages for numa KVA remap\n",
262 reserve_pages);
263 return reserve_pages;
264}
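The two-step conversion in the middle of the loop above (round the per-node byte count up to whole pmd-sized mappings, then express the result in 4KB pages) is worth spelling out. A small stand-alone sketch, taking LARGE_PAGE_BYTES and PTRS_PER_PTE at their non-PAE values (4MB and 1024) purely for illustration:

#include <stdio.h>

/* Illustrative non-PAE values; the kernel derives these from PTRS_PER_PTE. */
#define PAGE_SIZE		4096UL
#define PTRS_PER_PTE		1024UL
#define LARGE_PAGE_BYTES	(PTRS_PER_PTE * PAGE_SIZE)	/* 4MB */

/* Round a byte count up to whole pmd-sized mappings, result in 4KB pages. */
static unsigned long remap_pages(unsigned long bytes)
{
	unsigned long large = (bytes + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
	return large * PTRS_PER_PTE;
}

int main(void)
{
	/* 5MB of node-local data still consumes two 4MB mappings = 2048 pages */
	printf("%lu\n", remap_pages(5UL << 20));
	return 0;
}

So 5MB of node-local data still reserves two 4MB mappings, i.e. 2048 small pages.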
265
266extern void setup_bootmem_allocator(void);
267unsigned long __init setup_memory(void)
268{
269 int nid;
270 unsigned long system_start_pfn, system_max_low_pfn;
271
272 /*
273 * When mapping a NUMA machine we allocate the node_mem_map arrays
274 * from node local memory. They are then mapped directly into KVA
275 * between zone normal and vmalloc space. Calculate the size of
276	 * this space and use it to adjust the boundary between ZONE_NORMAL
277 * and ZONE_HIGHMEM.
278 */
279 find_max_pfn();
280 get_memcfg_numa();
281
282 kva_pages = calculate_numa_remap_pages();
283
284 /* partially used pages are not usable - thus round upwards */
285 system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
286
287 kva_start_pfn = find_max_low_pfn() - kva_pages;
288
289#ifdef CONFIG_BLK_DEV_INITRD
290 /* Numa kva area is below the initrd */
291 if (LOADER_TYPE && INITRD_START)
292 kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages;
293#endif
294 kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
295
296 system_max_low_pfn = max_low_pfn = find_max_low_pfn();
297 printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
298 kva_start_pfn, max_low_pfn);
299 printk("max_pfn = %ld\n", max_pfn);
300#ifdef CONFIG_HIGHMEM
301 highstart_pfn = highend_pfn = max_pfn;
302 if (max_pfn > system_max_low_pfn)
303 highstart_pfn = system_max_low_pfn;
304 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
305 pages_to_mb(highend_pfn - highstart_pfn));
306 num_physpages = highend_pfn;
307 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
308#else
309 num_physpages = system_max_low_pfn;
310 high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
311#endif
312 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
313 pages_to_mb(system_max_low_pfn));
314 printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
315 min_low_pfn, max_low_pfn, highstart_pfn);
316
317 printk("Low memory ends at vaddr %08lx\n",
318 (ulong) pfn_to_kaddr(max_low_pfn));
319 for_each_online_node(nid) {
320 node_remap_start_vaddr[nid] = pfn_to_kaddr(
321 kva_start_pfn + node_remap_offset[nid]);
322 /* Init the node remap allocator */
323 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
324 (node_remap_size[nid] * PAGE_SIZE);
325 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
326 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
327
328 allocate_pgdat(nid);
329 printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
330 (ulong) node_remap_start_vaddr[nid],
331 (ulong) pfn_to_kaddr(highstart_pfn
332 + node_remap_offset[nid] + node_remap_size[nid]));
333 }
334 printk("High memory starts at vaddr %08lx\n",
335 (ulong) pfn_to_kaddr(highstart_pfn));
336 for_each_online_node(nid)
337 find_max_pfn_node(nid);
338
339 memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
340 NODE_DATA(0)->bdata = &node0_bdata;
341 setup_bootmem_allocator();
342 return max_low_pfn;
343}
344
345void __init numa_kva_reserve(void)
346{
347 reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
348}
349
350void __init zone_sizes_init(void)
351{
352 int nid;
353 unsigned long max_zone_pfns[MAX_NR_ZONES];
354 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
355 max_zone_pfns[ZONE_DMA] =
356 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
357 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
358#ifdef CONFIG_HIGHMEM
359 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
360#endif
361
362 /* If SRAT has not registered memory, register it now */
363 if (find_max_pfn_with_active_regions() == 0) {
364 for_each_online_node(nid) {
365 if (node_has_online_mem(nid))
366 add_active_range(nid, node_start_pfn[nid],
367 node_end_pfn[nid]);
368 }
369 }
370
371 free_area_init_nodes(max_zone_pfns);
372 return;
373}
374
375void __init set_highmem_pages_init(int bad_ppro)
376{
377#ifdef CONFIG_HIGHMEM
378 struct zone *zone;
379 struct page *page;
380
381 for_each_zone(zone) {
382 unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
383
384 if (!is_highmem(zone))
385 continue;
386
387 zone_start_pfn = zone->zone_start_pfn;
388 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
389
390 printk("Initializing %s for node %d (%08lx:%08lx)\n",
391 zone->name, zone_to_nid(zone),
392 zone_start_pfn, zone_end_pfn);
393
394 for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
395 if (!pfn_valid(node_pfn))
396 continue;
397 page = pfn_to_page(node_pfn);
398 add_one_highpage_init(page, node_pfn, bad_ppro);
399 }
400 }
401 totalram_pages += totalhigh_pages;
402#endif
403}
404
405#ifdef CONFIG_MEMORY_HOTPLUG
406int paddr_to_nid(u64 addr)
407{
408 int nid;
409 unsigned long pfn = PFN_DOWN(addr);
410
411 for_each_node(nid)
412 if (node_start_pfn[nid] <= pfn &&
413 pfn < node_end_pfn[nid])
414 return nid;
415
416 return -1;
417}
418
419/*
420 * This function is used to ask node id BEFORE memmap and mem_section's
421 * initialization (pfn_to_nid() can't be used yet).
422 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
423 */
424int memory_add_physaddr_to_nid(u64 addr)
425{
426 int nid = paddr_to_nid(addr);
427 return (nid >= 0) ? nid : 0;
428}
429
430EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
431#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
new file mode 100644
index 000000000000..0ce4f22a2635
--- /dev/null
+++ b/arch/x86/mm/extable_32.c
@@ -0,0 +1,35 @@
1/*
2 * linux/arch/i386/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <asm/uaccess.h>
8
9int fixup_exception(struct pt_regs *regs)
10{
11 const struct exception_table_entry *fixup;
12
13#ifdef CONFIG_PNPBIOS
14 if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
15 {
16 extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
17 extern u32 pnp_bios_is_utter_crap;
18 pnp_bios_is_utter_crap = 1;
19 printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
20 __asm__ volatile(
21 "movl %0, %%esp\n\t"
22 "jmp *%1\n\t"
23 : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
24 panic("do_trap: can't hit this");
25 }
26#endif
27
28 fixup = search_exception_tables(regs->eip);
29 if (fixup) {
30 regs->eip = fixup->fixup;
31 return 1;
32 }
33
34 return 0;
35}
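fixup_exception() leans on the __ex_table entries that pair each potentially faulting instruction address with a recovery address; on a match the handler simply rewrites EIP and resumes. A rough sketch of that lookup outside the kernel (the two-field entry mirrors what this file and search_extable() below use; the linear scan is only a stand-in for the kernel's search over a sorted table):

#include <stddef.h>

struct exception_table_entry {
	unsigned long insn;	/* address of the instruction that may fault */
	unsigned long fixup;	/* address to resume at if it does */
};

/* Linear stand-in for the kernel's search over a sorted __ex_table. */
static const struct exception_table_entry *
find_fixup(const struct exception_table_entry *tbl, size_t n, unsigned long ip)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == ip)
			return &tbl[i];
	return NULL;		/* no entry: the fault is a real bug */
}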
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
new file mode 100644
index 000000000000..79ac6e7100af
--- /dev/null
+++ b/arch/x86/mm/extable_64.c
@@ -0,0 +1,34 @@
1/*
2 * linux/arch/x86_64/mm/extable.c
3 */
4
5#include <linux/module.h>
6#include <linux/spinlock.h>
7#include <linux/init.h>
8#include <asm/uaccess.h>
9
10/* Simple binary search */
11const struct exception_table_entry *
12search_extable(const struct exception_table_entry *first,
13 const struct exception_table_entry *last,
14 unsigned long value)
15{
16 /* Work around a B stepping K8 bug */
17 if ((value >> 32) == 0)
18 value |= 0xffffffffUL << 32;
19
20 while (first <= last) {
21 const struct exception_table_entry *mid;
22 long diff;
23
24 mid = (last - first) / 2 + first;
25 diff = mid->insn - value;
26 if (diff == 0)
27 return mid;
28 else if (diff < 0)
29 first = mid+1;
30 else
31 last = mid-1;
32 }
33 return NULL;
34}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
new file mode 100644
index 000000000000..fcb38e7f3543
--- /dev/null
+++ b/arch/x86/mm/fault_32.c
@@ -0,0 +1,657 @@
1/*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7#include <linux/signal.h>
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/string.h>
12#include <linux/types.h>
13#include <linux/ptrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/highmem.h>
22#include <linux/bootmem.h> /* for max_low_pfn */
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/desc.h>
31#include <asm/segment.h>
32
33extern void die(const char *,struct pt_regs *,long);
34
35static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
36
37int register_page_fault_notifier(struct notifier_block *nb)
38{
39 vmalloc_sync_all();
40 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
41}
42EXPORT_SYMBOL_GPL(register_page_fault_notifier);
43
44int unregister_page_fault_notifier(struct notifier_block *nb)
45{
46 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
47}
48EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
49
50static inline int notify_page_fault(struct pt_regs *regs, long err)
51{
52 struct die_args args = {
53 .regs = regs,
54 .str = "page fault",
55 .err = err,
56 .trapnr = 14,
57 .signr = SIGSEGV
58 };
59 return atomic_notifier_call_chain(&notify_page_fault_chain,
60 DIE_PAGE_FAULT, &args);
61}
62
63/*
64 * Return EIP plus the CS segment base. The segment limit is also
65 * adjusted, clamped to the kernel/user address space (whichever is
66 * appropriate), and returned in *eip_limit.
67 *
68 * The segment is checked, because it might have been changed by another
69 * task between the original faulting instruction and here.
70 *
71 * If CS is no longer a valid code segment, or if EIP is beyond the
72 * limit, or if it is a kernel address when CS is not a kernel segment,
73 * then the returned value will be greater than *eip_limit.
74 *
75 * This is slow, but is very rarely executed.
76 */
77static inline unsigned long get_segment_eip(struct pt_regs *regs,
78 unsigned long *eip_limit)
79{
80 unsigned long eip = regs->eip;
81 unsigned seg = regs->xcs & 0xffff;
82 u32 seg_ar, seg_limit, base, *desc;
83
84 /* Unlikely, but must come before segment checks. */
85 if (unlikely(regs->eflags & VM_MASK)) {
86 base = seg << 4;
87 *eip_limit = base + 0xffff;
88 return base + (eip & 0xffff);
89 }
90
91 /* The standard kernel/user address space limit. */
92 *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
93
94 /* By far the most common cases. */
95 if (likely(SEGMENT_IS_FLAT_CODE(seg)))
96 return eip;
97
98 /* Check the segment exists, is within the current LDT/GDT size,
99 that kernel/user (ring 0..3) has the appropriate privilege,
100 that it's a code segment, and get the limit. */
101 __asm__ ("larl %3,%0; lsll %3,%1"
102 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
103 if ((~seg_ar & 0x9800) || eip > seg_limit) {
104 *eip_limit = 0;
105 return 1; /* So that returned eip > *eip_limit. */
106 }
107
108 /* Get the GDT/LDT descriptor base.
109 When you look for races in this code remember that
110 LDT and other horrors are only used in user space. */
111 if (seg & (1<<2)) {
112 /* Must lock the LDT while reading it. */
113 down(&current->mm->context.sem);
114 desc = current->mm->context.ldt;
115 desc = (void *)desc + (seg & ~7);
116 } else {
117 /* Must disable preemption while reading the GDT. */
118 desc = (u32 *)get_cpu_gdt_table(get_cpu());
119 desc = (void *)desc + (seg & ~7);
120 }
121
122 /* Decode the code segment base from the descriptor */
123 base = get_desc_base((unsigned long *)desc);
124
125 if (seg & (1<<2)) {
126 up(&current->mm->context.sem);
127 } else
128 put_cpu();
129
130 /* Adjust EIP and segment limit, and clamp at the kernel limit.
131 It's legitimate for segments to wrap at 0xffffffff. */
132 seg_limit += base;
133 if (seg_limit < *eip_limit && seg_limit >= base)
134 *eip_limit = seg_limit;
135 return eip + base;
136}
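For the vm86 path at the top of get_segment_eip(), the linear address is the classic real-mode calculation: with seg = 0x1000 and eip = 0x0234, the base is 0x1000 << 4 = 0x10000, *eip_limit becomes 0x10000 + 0xffff = 0x1ffff, and the function returns 0x10234.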
137
138/*
139 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
140 * Check that here and ignore it.
141 */
142static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
143{
144 unsigned long limit;
145 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
146 int scan_more = 1;
147 int prefetch = 0;
148 int i;
149
150 for (i = 0; scan_more && i < 15; i++) {
151 unsigned char opcode;
152 unsigned char instr_hi;
153 unsigned char instr_lo;
154
155 if (instr > (unsigned char *)limit)
156 break;
157 if (probe_kernel_address(instr, opcode))
158 break;
159
160 instr_hi = opcode & 0xf0;
161 instr_lo = opcode & 0x0f;
162 instr++;
163
164 switch (instr_hi) {
165 case 0x20:
166 case 0x30:
167 /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
168 scan_more = ((instr_lo & 7) == 0x6);
169 break;
170
171 case 0x60:
172 /* 0x64 thru 0x67 are valid prefixes in all modes. */
173 scan_more = (instr_lo & 0xC) == 0x4;
174 break;
175 case 0xF0:
176 /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
177 scan_more = !instr_lo || (instr_lo>>1) == 1;
178 break;
179 case 0x00:
180 /* Prefetch instruction is 0x0F0D or 0x0F18 */
181 scan_more = 0;
182 if (instr > (unsigned char *)limit)
183 break;
184 if (probe_kernel_address(instr, opcode))
185 break;
186 prefetch = (instr_lo == 0xF) &&
187 (opcode == 0x0D || opcode == 0x18);
188 break;
189 default:
190 scan_more = 0;
191 break;
192 }
193 }
194 return prefetch;
195}
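As a worked example of the scan above, take the byte sequence 66 0f 18 06 (prefetchnta with an operand-size prefix) at the faulting EIP: 0x66 falls in the 0x60 case and keeps scanning, 0x0f lands in the 0x00 case with instr_lo == 0xF, and the following opcode byte 0x18 satisfies the prefetch test, so the spurious fault is ignored.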
196
197static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
198 unsigned long error_code)
199{
200 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
201 boot_cpu_data.x86 >= 6)) {
202 /* Catch an obscure case of prefetch inside an NX page. */
203 if (nx_enabled && (error_code & 16))
204 return 0;
205 return __is_prefetch(regs, addr);
206 }
207 return 0;
208}
209
210static noinline void force_sig_info_fault(int si_signo, int si_code,
211 unsigned long address, struct task_struct *tsk)
212{
213 siginfo_t info;
214
215 info.si_signo = si_signo;
216 info.si_errno = 0;
217 info.si_code = si_code;
218 info.si_addr = (void __user *)address;
219 force_sig_info(si_signo, &info, tsk);
220}
221
222fastcall void do_invalid_op(struct pt_regs *, unsigned long);
223
224static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225{
226 unsigned index = pgd_index(address);
227 pgd_t *pgd_k;
228 pud_t *pud, *pud_k;
229 pmd_t *pmd, *pmd_k;
230
231 pgd += index;
232 pgd_k = init_mm.pgd + index;
233
234 if (!pgd_present(*pgd_k))
235 return NULL;
236
237 /*
238 * set_pgd(pgd, *pgd_k); here would be useless on PAE
239 * and redundant with the set_pmd() on non-PAE. As would
240 * set_pud.
241 */
242
243 pud = pud_offset(pgd, address);
244 pud_k = pud_offset(pgd_k, address);
245 if (!pud_present(*pud_k))
246 return NULL;
247
248 pmd = pmd_offset(pud, address);
249 pmd_k = pmd_offset(pud_k, address);
250 if (!pmd_present(*pmd_k))
251 return NULL;
252 if (!pmd_present(*pmd)) {
253 set_pmd(pmd, *pmd_k);
254 arch_flush_lazy_mmu_mode();
255 } else
256 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
257 return pmd_k;
258}
259
260/*
261 * Handle a fault on the vmalloc or module mapping area
262 *
263 * This assumes no large pages in there.
264 */
265static inline int vmalloc_fault(unsigned long address)
266{
267 unsigned long pgd_paddr;
268 pmd_t *pmd_k;
269 pte_t *pte_k;
270 /*
271 * Synchronize this task's top level page-table
272 * with the 'reference' page table.
273 *
274 * Do _not_ use "current" here. We might be inside
275 * an interrupt in the middle of a task switch..
276 */
277 pgd_paddr = read_cr3();
278 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
279 if (!pmd_k)
280 return -1;
281 pte_k = pte_offset_kernel(pmd_k, address);
282 if (!pte_present(*pte_k))
283 return -1;
284 return 0;
285}
286
287int show_unhandled_signals = 1;
288
289/*
290 * This routine handles page faults. It determines the address,
291 * and the problem, and then passes it off to one of the appropriate
292 * routines.
293 *
294 * error_code:
295 * bit 0 == 0 means no page found, 1 means protection fault
296 * bit 1 == 0 means read, 1 means write
297 * bit 2 == 0 means kernel, 1 means user-mode
298 * bit 3 == 1 means use of reserved bit detected
299 * bit 4 == 1 means fault was an instruction fetch
300 */
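A compact way to read those bits is with the names the 64-bit handler later in this patch gives them (PF_PROT, PF_WRITE, PF_USER, PF_RSVD, PF_INSTR); the decoder below is only an illustrative user-space sketch, not code from this file:

#include <stdio.h>

/* Illustrative decoder for the error_code layout described above. */
#define PF_PROT  (1 << 0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE (1 << 1)	/* 0: read,          1: write */
#define PF_USER  (1 << 2)	/* 0: kernel,        1: user mode */
#define PF_RSVD  (1 << 3)	/* reserved bit detected in a paging entry */
#define PF_INSTR (1 << 4)	/* fault was an instruction fetch */

static void describe_fault(unsigned long error_code)
{
	printf("%s-mode %s, %s%s%s\n",
	       (error_code & PF_USER)  ? "user"  : "kernel",
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_PROT)  ? "protection fault" : "page not present",
	       (error_code & PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	describe_fault(0x6);	/* user-mode write to a not-present page */
	return 0;
}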
301fastcall void __kprobes do_page_fault(struct pt_regs *regs,
302 unsigned long error_code)
303{
304 struct task_struct *tsk;
305 struct mm_struct *mm;
306 struct vm_area_struct * vma;
307 unsigned long address;
308 int write, si_code;
309 int fault;
310
311 /* get the address */
312 address = read_cr2();
313
314 tsk = current;
315
316 si_code = SEGV_MAPERR;
317
318 /*
319 * We fault-in kernel-space virtual memory on-demand. The
320 * 'reference' page table is init_mm.pgd.
321 *
322 * NOTE! We MUST NOT take any locks for this case. We may
323 * be in an interrupt or a critical region, and should
324 * only copy the information from the master page table,
325 * nothing more.
326 *
327 * This verifies that the fault happens in kernel space
328 * (error_code & 4) == 0, and that the fault was not a
329 * protection error (error_code & 9) == 0.
330 */
331 if (unlikely(address >= TASK_SIZE)) {
332 if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
333 return;
334 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
335 return;
336 /*
337 * Don't take the mm semaphore here. If we fixup a prefetch
338 * fault we could otherwise deadlock.
339 */
340 goto bad_area_nosemaphore;
341 }
342
343 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
344 return;
345
346 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
347 fault has been handled. */
348 if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
349 local_irq_enable();
350
351 mm = tsk->mm;
352
353 /*
354 * If we're in an interrupt, have no user context or are running in an
355 * atomic region then we must not take the fault..
356 */
357 if (in_atomic() || !mm)
358 goto bad_area_nosemaphore;
359
360 /* When running in the kernel we expect faults to occur only to
361 * addresses in user space. All other faults represent errors in the
362	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
363 * erroneous fault occurring in a code path which already holds mmap_sem
364 * we will deadlock attempting to validate the fault against the
365 * address space. Luckily the kernel only validly references user
366 * space from well defined areas of code, which are listed in the
367 * exceptions table.
368 *
369 * As the vast majority of faults will be valid we will only perform
370	 * the source reference check when there is a possibility of a deadlock.
371 * Attempt to lock the address space, if we cannot we then validate the
372 * source. If this is invalid we can skip the address space check,
373 * thus avoiding the deadlock.
374 */
375 if (!down_read_trylock(&mm->mmap_sem)) {
376 if ((error_code & 4) == 0 &&
377 !search_exception_tables(regs->eip))
378 goto bad_area_nosemaphore;
379 down_read(&mm->mmap_sem);
380 }
381
382 vma = find_vma(mm, address);
383 if (!vma)
384 goto bad_area;
385 if (vma->vm_start <= address)
386 goto good_area;
387 if (!(vma->vm_flags & VM_GROWSDOWN))
388 goto bad_area;
389 if (error_code & 4) {
390 /*
391 * Accessing the stack below %esp is always a bug.
392 * The large cushion allows instructions like enter
393 * and pusha to work. ("enter $65535,$31" pushes
394 * 32 pointers and then decrements %esp by 65535.)
395 */
396 if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
397 goto bad_area;
398 }
399 if (expand_stack(vma, address))
400 goto bad_area;
401/*
402 * Ok, we have a good vm_area for this memory access, so
403 * we can handle it..
404 */
405good_area:
406 si_code = SEGV_ACCERR;
407 write = 0;
408 switch (error_code & 3) {
409 default: /* 3: write, present */
410 /* fall through */
411 case 2: /* write, not present */
412 if (!(vma->vm_flags & VM_WRITE))
413 goto bad_area;
414 write++;
415 break;
416 case 1: /* read, present */
417 goto bad_area;
418 case 0: /* read, not present */
419 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
420 goto bad_area;
421 }
422
423 survive:
424 /*
425 * If for any reason at all we couldn't handle the fault,
426 * make sure we exit gracefully rather than endlessly redo
427 * the fault.
428 */
429 fault = handle_mm_fault(mm, vma, address, write);
430 if (unlikely(fault & VM_FAULT_ERROR)) {
431 if (fault & VM_FAULT_OOM)
432 goto out_of_memory;
433 else if (fault & VM_FAULT_SIGBUS)
434 goto do_sigbus;
435 BUG();
436 }
437 if (fault & VM_FAULT_MAJOR)
438 tsk->maj_flt++;
439 else
440 tsk->min_flt++;
441
442 /*
443 * Did it hit the DOS screen memory VA from vm86 mode?
444 */
445 if (regs->eflags & VM_MASK) {
446 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
447 if (bit < 32)
448 tsk->thread.screen_bitmap |= 1 << bit;
449 }
450 up_read(&mm->mmap_sem);
451 return;
452
453/*
454 * Something tried to access memory that isn't in our memory map..
455 * Fix it, but check if it's kernel or user first..
456 */
457bad_area:
458 up_read(&mm->mmap_sem);
459
460bad_area_nosemaphore:
461 /* User mode accesses just cause a SIGSEGV */
462 if (error_code & 4) {
463 /*
464 * It's possible to have interrupts off here.
465 */
466 local_irq_enable();
467
468 /*
469 * Valid to do another page fault here because this one came
470 * from user space.
471 */
472 if (is_prefetch(regs, address, error_code))
473 return;
474
475 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
476 printk_ratelimit()) {
477 printk("%s%s[%d]: segfault at %08lx eip %08lx "
478 "esp %08lx error %lx\n",
479 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
480 tsk->comm, tsk->pid, address, regs->eip,
481 regs->esp, error_code);
482 }
483 tsk->thread.cr2 = address;
484 /* Kernel addresses are always protection faults */
485 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
486 tsk->thread.trap_no = 14;
487 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
488 return;
489 }
490
491#ifdef CONFIG_X86_F00F_BUG
492 /*
493 * Pentium F0 0F C7 C8 bug workaround.
494 */
495 if (boot_cpu_data.f00f_bug) {
496 unsigned long nr;
497
498 nr = (address - idt_descr.address) >> 3;
499
500 if (nr == 6) {
501 do_invalid_op(regs, 0);
502 return;
503 }
504 }
505#endif
506
507no_context:
508 /* Are we prepared to handle this kernel fault? */
509 if (fixup_exception(regs))
510 return;
511
512 /*
513 * Valid to do another page fault here, because if this fault
514 * had been triggered by is_prefetch fixup_exception would have
515 * handled it.
516 */
517 if (is_prefetch(regs, address, error_code))
518 return;
519
520/*
521 * Oops. The kernel tried to access some bad page. We'll have to
522 * terminate things with extreme prejudice.
523 */
524
525 bust_spinlocks(1);
526
527 if (oops_may_print()) {
528 __typeof__(pte_val(__pte(0))) page;
529
530#ifdef CONFIG_X86_PAE
531 if (error_code & 16) {
532 pte_t *pte = lookup_address(address);
533
534 if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
535 printk(KERN_CRIT "kernel tried to execute "
536 "NX-protected page - exploit attempt? "
537 "(uid: %d)\n", current->uid);
538 }
539#endif
540 if (address < PAGE_SIZE)
541 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
542 "pointer dereference");
543 else
544 printk(KERN_ALERT "BUG: unable to handle kernel paging"
545 " request");
546 printk(" at virtual address %08lx\n",address);
547 printk(KERN_ALERT " printing eip:\n");
548 printk("%08lx\n", regs->eip);
549
550 page = read_cr3();
551 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
552#ifdef CONFIG_X86_PAE
553 printk(KERN_ALERT "*pdpt = %016Lx\n", page);
554 if ((page >> PAGE_SHIFT) < max_low_pfn
555 && page & _PAGE_PRESENT) {
556 page &= PAGE_MASK;
557 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
558 & (PTRS_PER_PMD - 1)];
559 printk(KERN_ALERT "*pde = %016Lx\n", page);
560 page &= ~_PAGE_NX;
561 }
562#else
563 printk(KERN_ALERT "*pde = %08lx\n", page);
564#endif
565
566 /*
567 * We must not directly access the pte in the highpte
568 * case if the page table is located in highmem.
569 * And let's rather not kmap-atomic the pte, just in case
570 * it's allocated already.
571 */
572 if ((page >> PAGE_SHIFT) < max_low_pfn
573 && (page & _PAGE_PRESENT)) {
574 page &= PAGE_MASK;
575 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
576 & (PTRS_PER_PTE - 1)];
577 printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
578 }
579 }
580
581 tsk->thread.cr2 = address;
582 tsk->thread.trap_no = 14;
583 tsk->thread.error_code = error_code;
584 die("Oops", regs, error_code);
585 bust_spinlocks(0);
586 do_exit(SIGKILL);
587
588/*
589 * We ran out of memory, or some other thing happened to us that made
590 * us unable to handle the page fault gracefully.
591 */
592out_of_memory:
593 up_read(&mm->mmap_sem);
594 if (is_init(tsk)) {
595 yield();
596 down_read(&mm->mmap_sem);
597 goto survive;
598 }
599 printk("VM: killing process %s\n", tsk->comm);
600 if (error_code & 4)
601 do_exit(SIGKILL);
602 goto no_context;
603
604do_sigbus:
605 up_read(&mm->mmap_sem);
606
607 /* Kernel mode? Handle exceptions or die */
608 if (!(error_code & 4))
609 goto no_context;
610
611 /* User space => ok to do another page fault */
612 if (is_prefetch(regs, address, error_code))
613 return;
614
615 tsk->thread.cr2 = address;
616 tsk->thread.error_code = error_code;
617 tsk->thread.trap_no = 14;
618 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
619}
620
621void vmalloc_sync_all(void)
622{
623 /*
624 * Note that races in the updates of insync and start aren't
625 * problematic: insync can only get set bits added, and updates to
626 * start are only improving performance (without affecting correctness
627 * if undone).
628 */
629 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
630 static unsigned long start = TASK_SIZE;
631 unsigned long address;
632
633 if (SHARED_KERNEL_PMD)
634 return;
635
636 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
637 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
638 if (!test_bit(pgd_index(address), insync)) {
639 unsigned long flags;
640 struct page *page;
641
642 spin_lock_irqsave(&pgd_lock, flags);
643 for (page = pgd_list; page; page =
644 (struct page *)page->index)
645 if (!vmalloc_sync_one(page_address(page),
646 address)) {
647 BUG_ON(page != pgd_list);
648 break;
649 }
650 spin_unlock_irqrestore(&pgd_lock, flags);
651 if (!page)
652 set_bit(pgd_index(address), insync);
653 }
654 if (address == start && test_bit(pgd_index(address), insync))
655 start = address + PGDIR_SIZE;
656 }
657}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
new file mode 100644
index 000000000000..54816adb8e93
--- /dev/null
+++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
1/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/tty.h>
21#include <linux/vt_kern.h> /* For unblank_screen() */
22#include <linux/compiler.h>
23#include <linux/vmalloc.h>
24#include <linux/module.h>
25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
27#include <linux/kdebug.h>
28
29#include <asm/system.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm-generic/sections.h>
35
36/* Page fault error code bits */
37#define PF_PROT (1<<0) /* or no page found */
38#define PF_WRITE (1<<1)
39#define PF_USER (1<<2)
40#define PF_RSVD (1<<3)
41#define PF_INSTR (1<<4)
42
43static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
44
45/* Hook to register for page fault notifications */
46int register_page_fault_notifier(struct notifier_block *nb)
47{
48 vmalloc_sync_all();
49 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
50}
51EXPORT_SYMBOL_GPL(register_page_fault_notifier);
52
53int unregister_page_fault_notifier(struct notifier_block *nb)
54{
55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
56}
57EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
58
59static inline int notify_page_fault(struct pt_regs *regs, long err)
60{
61 struct die_args args = {
62 .regs = regs,
63 .str = "page fault",
64 .err = err,
65 .trapnr = 14,
66 .signr = SIGSEGV
67 };
68 return atomic_notifier_call_chain(&notify_page_fault_chain,
69 DIE_PAGE_FAULT, &args);
70}
71
72/* Sometimes the CPU reports invalid exceptions on prefetch.
73 Check that here and ignore.
74 Opcode checker based on code by Richard Brunner */
75static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
76 unsigned long error_code)
77{
78 unsigned char *instr;
79 int scan_more = 1;
80 int prefetch = 0;
81 unsigned char *max_instr;
82
83	/* If it was an exec fault, ignore it */
84 if (error_code & PF_INSTR)
85 return 0;
86
87 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
88 max_instr = instr + 15;
89
90 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
91 return 0;
92
93 while (scan_more && instr < max_instr) {
94 unsigned char opcode;
95 unsigned char instr_hi;
96 unsigned char instr_lo;
97
98 if (probe_kernel_address(instr, opcode))
99 break;
100
101 instr_hi = opcode & 0xf0;
102 instr_lo = opcode & 0x0f;
103 instr++;
104
105 switch (instr_hi) {
106 case 0x20:
107 case 0x30:
108 /* Values 0x26,0x2E,0x36,0x3E are valid x86
109 prefixes. In long mode, the CPU will signal
110 invalid opcode if some of these prefixes are
111 present so we will never get here anyway */
112 scan_more = ((instr_lo & 7) == 0x6);
113 break;
114
115 case 0x40:
116 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
117 Need to figure out under what instruction mode the
118 instruction was issued ... */
119 /* Could check the LDT for lm, but for now it's good
120 enough to assume that long mode only uses well known
121 segments or kernel. */
122 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
123 break;
124
125 case 0x60:
126 /* 0x64 thru 0x67 are valid prefixes in all modes. */
127 scan_more = (instr_lo & 0xC) == 0x4;
128 break;
129 case 0xF0:
130 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
131 scan_more = !instr_lo || (instr_lo>>1) == 1;
132 break;
133 case 0x00:
134 /* Prefetch instruction is 0x0F0D or 0x0F18 */
135 scan_more = 0;
136 if (probe_kernel_address(instr, opcode))
137 break;
138 prefetch = (instr_lo == 0xF) &&
139 (opcode == 0x0D || opcode == 0x18);
140 break;
141 default:
142 scan_more = 0;
143 break;
144 }
145 }
146 return prefetch;
147}
148
149static int bad_address(void *p)
150{
151 unsigned long dummy;
152 return probe_kernel_address((unsigned long *)p, dummy);
153}
154
155void dump_pagetable(unsigned long address)
156{
157 pgd_t *pgd;
158 pud_t *pud;
159 pmd_t *pmd;
160 pte_t *pte;
161
162 pgd = (pgd_t *)read_cr3();
163
164 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
165 pgd += pgd_index(address);
166 if (bad_address(pgd)) goto bad;
167 printk("PGD %lx ", pgd_val(*pgd));
168 if (!pgd_present(*pgd)) goto ret;
169
170 pud = pud_offset(pgd, address);
171 if (bad_address(pud)) goto bad;
172 printk("PUD %lx ", pud_val(*pud));
173 if (!pud_present(*pud)) goto ret;
174
175 pmd = pmd_offset(pud, address);
176 if (bad_address(pmd)) goto bad;
177 printk("PMD %lx ", pmd_val(*pmd));
178 if (!pmd_present(*pmd)) goto ret;
179
180 pte = pte_offset_kernel(pmd, address);
181 if (bad_address(pte)) goto bad;
182 printk("PTE %lx", pte_val(*pte));
183ret:
184 printk("\n");
185 return;
186bad:
187 printk("BAD\n");
188}
189
190static const char errata93_warning[] =
191KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
192KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
193KERN_ERR "******* Please consider a BIOS update.\n"
194KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
195
196/* Workaround for K8 erratum #93 & buggy BIOS.
197 BIOS SMM functions are required to use a specific workaround
198 to avoid corruption of the 64bit RIP register on C stepping K8.
199 A lot of BIOS that didn't get tested properly miss this.
200 The OS sees this as a page fault with the upper 32bits of RIP cleared.
201 Try to work around it here.
202 Note we only handle faults in kernel here. */
203
204static int is_errata93(struct pt_regs *regs, unsigned long address)
205{
206 static int warned;
207 if (address != regs->rip)
208 return 0;
209 if ((address >> 32) != 0)
210 return 0;
211 address |= 0xffffffffUL << 32;
212 if ((address >= (u64)_stext && address <= (u64)_etext) ||
213 (address >= MODULES_VADDR && address <= MODULES_END)) {
214 if (!warned) {
215 printk(errata93_warning);
216 warned = 1;
217 }
218 regs->rip = address;
219 return 1;
220 }
221 return 0;
222}
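The fix-up in is_errata93() is a plain sign-extension of the truncated RIP: a kernel-text address such as 0xffffffff80201234 arrives with its upper half cleared as 0x0000000080201234, and ORing 0xffffffff00000000 back in restores the original value before the range check against _stext/_etext and the module area.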
223
224static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
225 unsigned long error_code)
226{
227 unsigned long flags = oops_begin();
228 struct task_struct *tsk;
229
230 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
231 current->comm, address);
232 dump_pagetable(address);
233 tsk = current;
234 tsk->thread.cr2 = address;
235 tsk->thread.trap_no = 14;
236 tsk->thread.error_code = error_code;
237 __die("Bad pagetable", regs, error_code);
238 oops_end(flags);
239 do_exit(SIGKILL);
240}
241
242/*
243 * Handle a fault on the vmalloc area
244 *
245 * This assumes no large pages in there.
246 */
247static int vmalloc_fault(unsigned long address)
248{
249 pgd_t *pgd, *pgd_ref;
250 pud_t *pud, *pud_ref;
251 pmd_t *pmd, *pmd_ref;
252 pte_t *pte, *pte_ref;
253
254 /* Copy kernel mappings over when needed. This can also
255	   happen within a race in page table update. In the latter
256 case just flush. */
257
258 pgd = pgd_offset(current->mm ?: &init_mm, address);
259 pgd_ref = pgd_offset_k(address);
260 if (pgd_none(*pgd_ref))
261 return -1;
262 if (pgd_none(*pgd))
263 set_pgd(pgd, *pgd_ref);
264 else
265 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
266
267 /* Below here mismatches are bugs because these lower tables
268 are shared */
269
270 pud = pud_offset(pgd, address);
271 pud_ref = pud_offset(pgd_ref, address);
272 if (pud_none(*pud_ref))
273 return -1;
274 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
275 BUG();
276 pmd = pmd_offset(pud, address);
277 pmd_ref = pmd_offset(pud_ref, address);
278 if (pmd_none(*pmd_ref))
279 return -1;
280 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
281 BUG();
282 pte_ref = pte_offset_kernel(pmd_ref, address);
283 if (!pte_present(*pte_ref))
284 return -1;
285 pte = pte_offset_kernel(pmd, address);
286 /* Don't use pte_page here, because the mappings can point
287 outside mem_map, and the NUMA hash lookup cannot handle
288 that. */
289 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
290 BUG();
291 return 0;
292}
293
294static int page_fault_trace;
295int show_unhandled_signals = 1;
296
297/*
298 * This routine handles page faults. It determines the address,
299 * and the problem, and then passes it off to one of the appropriate
300 * routines.
301 */
302asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
303 unsigned long error_code)
304{
305 struct task_struct *tsk;
306 struct mm_struct *mm;
307 struct vm_area_struct * vma;
308 unsigned long address;
309 const struct exception_table_entry *fixup;
310 int write, fault;
311 unsigned long flags;
312 siginfo_t info;
313
314 tsk = current;
315 mm = tsk->mm;
316 prefetchw(&mm->mmap_sem);
317
318 /* get the address */
319 address = read_cr2();
320
321 info.si_code = SEGV_MAPERR;
322
323
324 /*
325 * We fault-in kernel-space virtual memory on-demand. The
326 * 'reference' page table is init_mm.pgd.
327 *
328 * NOTE! We MUST NOT take any locks for this case. We may
329 * be in an interrupt or a critical region, and should
330 * only copy the information from the master page table,
331 * nothing more.
332 *
333 * This verifies that the fault happens in kernel space
334 * (error_code & 4) == 0, and that the fault was not a
335 * protection error (error_code & 9) == 0.
336 */
337 if (unlikely(address >= TASK_SIZE64)) {
338 /*
339 * Don't check for the module range here: its PML4
340 * is always initialized because it's shared with the main
341 * kernel text. Only vmalloc may need PML4 syncups.
342 */
343 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
344 ((address >= VMALLOC_START && address < VMALLOC_END))) {
345 if (vmalloc_fault(address) >= 0)
346 return;
347 }
348 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
349 return;
350 /*
351 * Don't take the mm semaphore here. If we fixup a prefetch
352 * fault we could otherwise deadlock.
353 */
354 goto bad_area_nosemaphore;
355 }
356
357 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
358 return;
359
360 if (likely(regs->eflags & X86_EFLAGS_IF))
361 local_irq_enable();
362
363 if (unlikely(page_fault_trace))
364 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
365 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
366
367 if (unlikely(error_code & PF_RSVD))
368 pgtable_bad(address, regs, error_code);
369
370 /*
371 * If we're in an interrupt or have no user
372 * context, we must not take the fault..
373 */
374 if (unlikely(in_atomic() || !mm))
375 goto bad_area_nosemaphore;
376
377 /*
378 * User-mode registers count as a user access even for any
379 * potential system fault or CPU buglet.
380 */
381 if (user_mode_vm(regs))
382 error_code |= PF_USER;
383
384 again:
385 /* When running in the kernel we expect faults to occur only to
386 * addresses in user space. All other faults represent errors in the
387	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
388 * erroneous fault occurring in a code path which already holds mmap_sem
389 * we will deadlock attempting to validate the fault against the
390 * address space. Luckily the kernel only validly references user
391 * space from well defined areas of code, which are listed in the
392 * exceptions table.
393 *
394 * As the vast majority of faults will be valid we will only perform
395	 * the source reference check when there is a possibility of a deadlock.
396 * Attempt to lock the address space, if we cannot we then validate the
397 * source. If this is invalid we can skip the address space check,
398 * thus avoiding the deadlock.
399 */
400 if (!down_read_trylock(&mm->mmap_sem)) {
401 if ((error_code & PF_USER) == 0 &&
402 !search_exception_tables(regs->rip))
403 goto bad_area_nosemaphore;
404 down_read(&mm->mmap_sem);
405 }
406
407 vma = find_vma(mm, address);
408 if (!vma)
409 goto bad_area;
410 if (likely(vma->vm_start <= address))
411 goto good_area;
412 if (!(vma->vm_flags & VM_GROWSDOWN))
413 goto bad_area;
414 if (error_code & 4) {
415 /* Allow userspace just enough access below the stack pointer
416 * to let the 'enter' instruction work.
417 */
418 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
419 goto bad_area;
420 }
421 if (expand_stack(vma, address))
422 goto bad_area;
423/*
424 * Ok, we have a good vm_area for this memory access, so
425 * we can handle it..
426 */
427good_area:
428 info.si_code = SEGV_ACCERR;
429 write = 0;
430 switch (error_code & (PF_PROT|PF_WRITE)) {
431 default: /* 3: write, present */
432 /* fall through */
433 case PF_WRITE: /* write, not present */
434 if (!(vma->vm_flags & VM_WRITE))
435 goto bad_area;
436 write++;
437 break;
438 case PF_PROT: /* read, present */
439 goto bad_area;
440 case 0: /* read, not present */
441 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
442 goto bad_area;
443 }
444
445 /*
446 * If for any reason at all we couldn't handle the fault,
447 * make sure we exit gracefully rather than endlessly redo
448 * the fault.
449 */
450 fault = handle_mm_fault(mm, vma, address, write);
451 if (unlikely(fault & VM_FAULT_ERROR)) {
452 if (fault & VM_FAULT_OOM)
453 goto out_of_memory;
454 else if (fault & VM_FAULT_SIGBUS)
455 goto do_sigbus;
456 BUG();
457 }
458 if (fault & VM_FAULT_MAJOR)
459 tsk->maj_flt++;
460 else
461 tsk->min_flt++;
462 up_read(&mm->mmap_sem);
463 return;
464
465/*
466 * Something tried to access memory that isn't in our memory map..
467 * Fix it, but check if it's kernel or user first..
468 */
469bad_area:
470 up_read(&mm->mmap_sem);
471
472bad_area_nosemaphore:
473 /* User mode accesses just cause a SIGSEGV */
474 if (error_code & PF_USER) {
475
476 /*
477 * It's possible to have interrupts off here.
478 */
479 local_irq_enable();
480
481 if (is_prefetch(regs, address, error_code))
482 return;
483
484		/* Work around K8 erratum #100: the K8 in compat mode
485 occasionally jumps to illegal addresses >4GB. We
486 catch this here in the page fault handler because
487 these addresses are not reachable. Just detect this
488 case and return. Any code segment in LDT is
489 compatibility mode. */
490 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
491 (address >> 32))
492 return;
493
494 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
495 printk_ratelimit()) {
496 printk(
497 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
498 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
499 tsk->comm, tsk->pid, address, regs->rip,
500 regs->rsp, error_code);
501 }
502
503 tsk->thread.cr2 = address;
504 /* Kernel addresses are always protection faults */
505 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
506 tsk->thread.trap_no = 14;
507 info.si_signo = SIGSEGV;
508 info.si_errno = 0;
509 /* info.si_code has been set above */
510 info.si_addr = (void __user *)address;
511 force_sig_info(SIGSEGV, &info, tsk);
512 return;
513 }
514
515no_context:
516
517 /* Are we prepared to handle this kernel fault? */
518 fixup = search_exception_tables(regs->rip);
519 if (fixup) {
520 regs->rip = fixup->fixup;
521 return;
522 }
523
524 /*
525 * Hall of shame of CPU/BIOS bugs.
526 */
527
528 if (is_prefetch(regs, address, error_code))
529 return;
530
531 if (is_errata93(regs, address))
532 return;
533
534/*
535 * Oops. The kernel tried to access some bad page. We'll have to
536 * terminate things with extreme prejudice.
537 */
538
539 flags = oops_begin();
540
541 if (address < PAGE_SIZE)
542 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
543 else
544 printk(KERN_ALERT "Unable to handle kernel paging request");
545 printk(" at %016lx RIP: \n" KERN_ALERT,address);
546 printk_address(regs->rip);
547 dump_pagetable(address);
548 tsk->thread.cr2 = address;
549 tsk->thread.trap_no = 14;
550 tsk->thread.error_code = error_code;
551 __die("Oops", regs, error_code);
552 /* Executive summary in case the body of the oops scrolled away */
553 printk(KERN_EMERG "CR2: %016lx\n", address);
554 oops_end(flags);
555 do_exit(SIGKILL);
556
557/*
558 * We ran out of memory, or some other thing happened to us that made
559 * us unable to handle the page fault gracefully.
560 */
561out_of_memory:
562 up_read(&mm->mmap_sem);
563 if (is_init(current)) {
564 yield();
565 goto again;
566 }
567 printk("VM: killing process %s\n", tsk->comm);
568 if (error_code & 4)
569 do_group_exit(SIGKILL);
570 goto no_context;
571
572do_sigbus:
573 up_read(&mm->mmap_sem);
574
575 /* Kernel mode? Handle exceptions or die */
576 if (!(error_code & PF_USER))
577 goto no_context;
578
579 tsk->thread.cr2 = address;
580 tsk->thread.error_code = error_code;
581 tsk->thread.trap_no = 14;
582 info.si_signo = SIGBUS;
583 info.si_errno = 0;
584 info.si_code = BUS_ADRERR;
585 info.si_addr = (void __user *)address;
586 force_sig_info(SIGBUS, &info, tsk);
587 return;
588}
589
590DEFINE_SPINLOCK(pgd_lock);
591LIST_HEAD(pgd_list);
592
593void vmalloc_sync_all(void)
594{
595 /* Note that races in the updates of insync and start aren't
596 problematic:
597 insync can only get set bits added, and updates to start are only
598 improving performance (without affecting correctness if undone). */
599 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
600 static unsigned long start = VMALLOC_START & PGDIR_MASK;
601 unsigned long address;
602
603 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
604 if (!test_bit(pgd_index(address), insync)) {
605 const pgd_t *pgd_ref = pgd_offset_k(address);
606 struct page *page;
607
608 if (pgd_none(*pgd_ref))
609 continue;
610 spin_lock(&pgd_lock);
611 list_for_each_entry(page, &pgd_list, lru) {
612 pgd_t *pgd;
613 pgd = (pgd_t *)page_address(page) + pgd_index(address);
614 if (pgd_none(*pgd))
615 set_pgd(pgd, *pgd_ref);
616 else
617 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
618 }
619 spin_unlock(&pgd_lock);
620 set_bit(pgd_index(address), insync);
621 }
622 if (address == start)
623 start = address + PGDIR_SIZE;
624 }
625 /* Check that there is no need to do the same for the modules area. */
626 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
627 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
628 (__START_KERNEL & PGDIR_MASK)));
629}
630
631static int __init enable_pagefaulttrace(char *str)
632{
633 page_fault_trace = 1;
634 return 1;
635}
636__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000000..1c3bf95f7356
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,113 @@
1#include <linux/highmem.h>
2#include <linux/module.h>
3
4void *kmap(struct page *page)
5{
6 might_sleep();
7 if (!PageHighMem(page))
8 return page_address(page);
9 return kmap_high(page);
10}
11
12void kunmap(struct page *page)
13{
14 if (in_interrupt())
15 BUG();
16 if (!PageHighMem(page))
17 return;
18 kunmap_high(page);
19}
20
21/*
22 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
23 * no global lock is needed and because the kmap code must perform a global TLB
24 * invalidation when the kmap pool wraps.
25 *
26 * However, when holding an atomic kmap it is not legal to sleep, so atomic
27 * kmaps are appropriate for short, tight code paths only.
28 */
29void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
30{
31 enum fixed_addresses idx;
32 unsigned long vaddr;
33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
35 pagefault_disable();
36
37 if (!PageHighMem(page))
38 return page_address(page);
39
40 idx = type + KM_TYPE_NR*smp_processor_id();
41 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
42 BUG_ON(!pte_none(*(kmap_pte-idx)));
43 set_pte(kmap_pte-idx, mk_pte(page, prot));
44 arch_flush_lazy_mmu_mode();
45
46 return (void *)vaddr;
47}
48
49void *kmap_atomic(struct page *page, enum km_type type)
50{
51 return kmap_atomic_prot(page, type, kmap_prot);
52}
53
54void kunmap_atomic(void *kvaddr, enum km_type type)
55{
56 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
57 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
58
59 /*
60 * Force other mappings to Oops if they'll try to access this pte
61	 * without first remapping it. Keeping stale mappings around is also
62	 * a bad idea, in case the page changes cacheability attributes or becomes
63 * a protected page in a hypervisor.
64 */
65 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
66 kpte_clear_flush(kmap_pte-idx, vaddr);
67 else {
68#ifdef CONFIG_DEBUG_HIGHMEM
69 BUG_ON(vaddr < PAGE_OFFSET);
70 BUG_ON(vaddr >= (unsigned long)high_memory);
71#endif
72 }
73
74 arch_flush_lazy_mmu_mode();
75 pagefault_enable();
76}
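A typical caller pairs the two under the constraint spelled out in the comment above: the slot is per-CPU, so nothing between the map and the unmap may sleep. A minimal sketch of copying into a possibly-highmem destination page through the KM_USER0 slot (assumes linux/highmem.h and linux/string.h; the helper name is illustrative, not part of this file):

#include <linux/highmem.h>
#include <linux/string.h>

/* Sketch only: copy 'len' bytes into a page that may live in highmem. */
static void copy_to_page(struct page *page, const void *src, unsigned int len)
{
	void *dst = kmap_atomic(page, KM_USER0);

	memcpy(dst, src, len);
	kunmap_atomic(dst, KM_USER0);	/* no sleeping between map and unmap */
}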
77
78/* This is the same as kmap_atomic() but can map memory that doesn't
79 * have a struct page associated with it.
80 */
81void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
82{
83 enum fixed_addresses idx;
84 unsigned long vaddr;
85
86 pagefault_disable();
87
88 idx = type + KM_TYPE_NR*smp_processor_id();
89 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
90 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
91 arch_flush_lazy_mmu_mode();
92
93 return (void*) vaddr;
94}
95
96struct page *kmap_atomic_to_page(void *ptr)
97{
98 unsigned long idx, vaddr = (unsigned long)ptr;
99 pte_t *pte;
100
101 if (vaddr < FIXADDR_START)
102 return virt_to_page(ptr);
103
104 idx = virt_to_fix(vaddr);
105 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
106 return pte_page(*pte);
107}
108
109EXPORT_SYMBOL(kmap);
110EXPORT_SYMBOL(kunmap);
111EXPORT_SYMBOL(kmap_atomic);
112EXPORT_SYMBOL(kunmap_atomic);
113EXPORT_SYMBOL(kmap_atomic_to_page);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000000..6c06d9c0488e
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,391 @@
1/*
2 * IA-32 Huge TLB Page Support for Kernel.
3 *
4 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
5 */
6
7#include <linux/init.h>
8#include <linux/fs.h>
9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/err.h>
14#include <linux/sysctl.h>
15#include <asm/mman.h>
16#include <asm/tlb.h>
17#include <asm/tlbflush.h>
18
19static unsigned long page_table_shareable(struct vm_area_struct *svma,
20 struct vm_area_struct *vma,
21 unsigned long addr, pgoff_t idx)
22{
23 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
24 svma->vm_start;
25 unsigned long sbase = saddr & PUD_MASK;
26 unsigned long s_end = sbase + PUD_SIZE;
27
28 /*
29	 * match the virtual addresses, permissions and the alignment of the
30 * page table page.
31 */
32 if (pmd_index(addr) != pmd_index(saddr) ||
33 vma->vm_flags != svma->vm_flags ||
34 sbase < svma->vm_start || svma->vm_end < s_end)
35 return 0;
36
37 return saddr;
38}
39
40static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
41{
42 unsigned long base = addr & PUD_MASK;
43 unsigned long end = base + PUD_SIZE;
44
45 /*
46 * check on proper vm_flags and page table alignment
47 */
48 if (vma->vm_flags & VM_MAYSHARE &&
49 vma->vm_start <= base && end <= vma->vm_end)
50 return 1;
51 return 0;
52}
53
54/*
55 * search for a shareable pmd page for hugetlb.
56 */
57static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
58{
59 struct vm_area_struct *vma = find_vma(mm, addr);
60 struct address_space *mapping = vma->vm_file->f_mapping;
61 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
62 vma->vm_pgoff;
63 struct prio_tree_iter iter;
64 struct vm_area_struct *svma;
65 unsigned long saddr;
66 pte_t *spte = NULL;
67
68 if (!vma_shareable(vma, addr))
69 return;
70
71 spin_lock(&mapping->i_mmap_lock);
72 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
73 if (svma == vma)
74 continue;
75
76 saddr = page_table_shareable(svma, vma, addr, idx);
77 if (saddr) {
78 spte = huge_pte_offset(svma->vm_mm, saddr);
79 if (spte) {
80 get_page(virt_to_page(spte));
81 break;
82 }
83 }
84 }
85
86 if (!spte)
87 goto out;
88
89 spin_lock(&mm->page_table_lock);
90 if (pud_none(*pud))
91 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
92 else
93 put_page(virt_to_page(spte));
94 spin_unlock(&mm->page_table_lock);
95out:
96 spin_unlock(&mapping->i_mmap_lock);
97}
98
99/*
100 * unmap huge page backed by shared pte.
101 *
102 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
103 * indicated by page_count > 1, unmap is achieved by clearing pud and
104 * decrementing the ref count. If count == 1, the pte page is not shared.
105 *
106 * called with vma->vm_mm->page_table_lock held.
107 *
108 * returns: 1 successfully unmapped a shared pte page
109 * 0 the underlying pte page is not shared, or it is the last user
110 */
111int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
112{
113 pgd_t *pgd = pgd_offset(mm, *addr);
114 pud_t *pud = pud_offset(pgd, *addr);
115
116 BUG_ON(page_count(virt_to_page(ptep)) == 0);
117 if (page_count(virt_to_page(ptep)) == 1)
118 return 0;
119
120 pud_clear(pud);
121 put_page(virt_to_page(ptep));
122 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
123 return 1;
124}
125
126pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
127{
128 pgd_t *pgd;
129 pud_t *pud;
130 pte_t *pte = NULL;
131
132 pgd = pgd_offset(mm, addr);
133 pud = pud_alloc(mm, pgd, addr);
134 if (pud) {
135 if (pud_none(*pud))
136 huge_pmd_share(mm, addr, pud);
137 pte = (pte_t *) pmd_alloc(mm, pud, addr);
138 }
139 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
140
141 return pte;
142}
143
144pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
145{
146 pgd_t *pgd;
147 pud_t *pud;
148 pmd_t *pmd = NULL;
149
150 pgd = pgd_offset(mm, addr);
151 if (pgd_present(*pgd)) {
152 pud = pud_offset(pgd, addr);
153 if (pud_present(*pud))
154 pmd = pmd_offset(pud, addr);
155 }
156 return (pte_t *) pmd;
157}
158
159#if 0 /* This is just for testing */
160struct page *
161follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
162{
163 unsigned long vpfn = address / PAGE_SIZE;
164 struct page *page;
165 struct vm_area_struct *vma;
166 pte_t *pte;
167
168 /* note: compiled out (#if 0), kept for testing only */
169 vma = find_vma(mm, address);
170 if (!vma || !is_vm_hugetlb_page(vma))
171 return ERR_PTR(-EINVAL);
172
173 pte = huge_pte_offset(mm, address);
174
175 /* hugetlb should be locked, and hence, prefaulted */
176 WARN_ON(!pte || pte_none(*pte));
177
178 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
179
180 WARN_ON(!PageCompound(page));
181
182 return page;
183}
184
185int pmd_huge(pmd_t pmd)
186{
187 return 0;
188}
189
190struct page *
191follow_huge_pmd(struct mm_struct *mm, unsigned long address,
192 pmd_t *pmd, int write)
193{
194 return NULL;
195}
196
197#else
198
199struct page *
200follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
201{
202 return ERR_PTR(-EINVAL);
203}
204
205int pmd_huge(pmd_t pmd)
206{
207 return !!(pmd_val(pmd) & _PAGE_PSE);
208}
209
210struct page *
211follow_huge_pmd(struct mm_struct *mm, unsigned long address,
212 pmd_t *pmd, int write)
213{
214 struct page *page;
215
216 page = pte_page(*(pte_t *)pmd);
217 if (page)
218 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
219 return page;
220}
221#endif
222
223/* x86_64 also uses this file */
224
225#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
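/*
 * Bottom-up search for a free, HPAGE_SIZE-aligned range of length @len,
 * starting from the cached free_area_cache hint and restarting from
 * TASK_UNMAPPED_BASE before giving up with -ENOMEM.
 */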
226static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
227 unsigned long addr, unsigned long len,
228 unsigned long pgoff, unsigned long flags)
229{
230 struct mm_struct *mm = current->mm;
231 struct vm_area_struct *vma;
232 unsigned long start_addr;
233
234 if (len > mm->cached_hole_size) {
235 start_addr = mm->free_area_cache;
236 } else {
237 start_addr = TASK_UNMAPPED_BASE;
238 mm->cached_hole_size = 0;
239 }
240
241full_search:
242 addr = ALIGN(start_addr, HPAGE_SIZE);
243
244 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
245 /* At this point: (!vma || addr < vma->vm_end). */
246 if (TASK_SIZE - len < addr) {
247 /*
248 * Start a new search - just in case we missed
249 * some holes.
250 */
251 if (start_addr != TASK_UNMAPPED_BASE) {
252 start_addr = TASK_UNMAPPED_BASE;
253 mm->cached_hole_size = 0;
254 goto full_search;
255 }
256 return -ENOMEM;
257 }
258 if (!vma || addr + len <= vma->vm_start) {
259 mm->free_area_cache = addr + len;
260 return addr;
261 }
262 if (addr + mm->cached_hole_size < vma->vm_start)
263 mm->cached_hole_size = vma->vm_start - addr;
264 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
265 }
266}
267
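/*
 * Top-down variant: look for an HPAGE_SIZE-aligned hole below mm->mmap_base,
 * retry once from the base, and finally fall back to the bottom-up allocator
 * if nothing fits.
 */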
268static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
269 unsigned long addr0, unsigned long len,
270 unsigned long pgoff, unsigned long flags)
271{
272 struct mm_struct *mm = current->mm;
273 struct vm_area_struct *vma, *prev_vma;
274 unsigned long base = mm->mmap_base, addr = addr0;
275 unsigned long largest_hole = mm->cached_hole_size;
276 int first_time = 1;
277
278 /* don't allow allocations above current base */
279 if (mm->free_area_cache > base)
280 mm->free_area_cache = base;
281
282 if (len <= largest_hole) {
283 largest_hole = 0;
284 mm->free_area_cache = base;
285 }
286try_again:
287 /* make sure it can fit in the remaining address space */
288 if (mm->free_area_cache < len)
289 goto fail;
290
291 /* either no address requested or can't fit in requested address hole */
292 addr = (mm->free_area_cache - len) & HPAGE_MASK;
293 do {
294 /*
295 * Lookup failure means no vma is above this address,
296 * i.e. return with success:
297 */
298 if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
299 return addr;
300
301 /*
302 * new region fits between prev_vma->vm_end and
303 * vma->vm_start, use it:
304 */
305 if (addr + len <= vma->vm_start &&
306 (!prev_vma || (addr >= prev_vma->vm_end))) {
307 /* remember the address as a hint for next time */
308 mm->cached_hole_size = largest_hole;
309 return (mm->free_area_cache = addr);
310 } else {
311 /* pull free_area_cache down to the first hole */
312 if (mm->free_area_cache == vma->vm_end) {
313 mm->free_area_cache = vma->vm_start;
314 mm->cached_hole_size = largest_hole;
315 }
316 }
317
318 /* remember the largest hole we saw so far */
319 if (addr + largest_hole < vma->vm_start)
320 largest_hole = vma->vm_start - addr;
321
322 /* try just below the current vma->vm_start */
323 addr = (vma->vm_start - len) & HPAGE_MASK;
324 } while (len <= vma->vm_start);
325
326fail:
327 /*
328 * if hint left us with no space for the requested
329 * mapping then try again:
330 */
331 if (first_time) {
332 mm->free_area_cache = base;
333 largest_hole = 0;
334 first_time = 0;
335 goto try_again;
336 }
337 /*
338 * A failed mmap() very likely causes application failure,
339 * so fall back to the bottom-up function here. This scenario
340 * can happen with large stack limits and large mmap()
341 * allocations.
342 */
343 mm->free_area_cache = TASK_UNMAPPED_BASE;
344 mm->cached_hole_size = ~0UL;
345 addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
346 len, pgoff, flags);
347
348 /*
349 * Restore the topdown base:
350 */
351 mm->free_area_cache = base;
352 mm->cached_hole_size = ~0UL;
353
354 return addr;
355}
356
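/*
 * Arch hook for placing hugetlb mappings: check size and alignment, honour
 * MAP_FIXED and any address hint, then use the bottom-up or top-down helper
 * to match the mm's normal mmap layout.
 */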
357unsigned long
358hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
359 unsigned long len, unsigned long pgoff, unsigned long flags)
360{
361 struct mm_struct *mm = current->mm;
362 struct vm_area_struct *vma;
363
364 if (len & ~HPAGE_MASK)
365 return -EINVAL;
366 if (len > TASK_SIZE)
367 return -ENOMEM;
368
369 if (flags & MAP_FIXED) {
370 if (prepare_hugepage_range(addr, len))
371 return -EINVAL;
372 return addr;
373 }
374
375 if (addr) {
376 addr = ALIGN(addr, HPAGE_SIZE);
377 vma = find_vma(mm, addr);
378 if (TASK_SIZE - len >= addr &&
379 (!vma || addr + len <= vma->vm_start))
380 return addr;
381 }
382 if (mm->get_unmapped_area == arch_get_unmapped_area)
383 return hugetlb_get_unmapped_area_bottomup(file, addr, len,
384 pgoff, flags);
385 else
386 return hugetlb_get_unmapped_area_topdown(file, addr, len,
387 pgoff, flags);
388}
389
390#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
391
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 000000000000..730a5b177b1f
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,858 @@
1/*
2 * linux/arch/i386/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7 */
8
9#include <linux/module.h>
10#include <linux/signal.h>
11#include <linux/sched.h>
12#include <linux/kernel.h>
13#include <linux/errno.h>
14#include <linux/string.h>
15#include <linux/types.h>
16#include <linux/ptrace.h>
17#include <linux/mman.h>
18#include <linux/mm.h>
19#include <linux/hugetlb.h>
20#include <linux/swap.h>
21#include <linux/smp.h>
22#include <linux/init.h>
23#include <linux/highmem.h>
24#include <linux/pagemap.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/bootmem.h>
28#include <linux/slab.h>
29#include <linux/proc_fs.h>
30#include <linux/efi.h>
31#include <linux/memory_hotplug.h>
32#include <linux/initrd.h>
33#include <linux/cpumask.h>
34
35#include <asm/processor.h>
36#include <asm/system.h>
37#include <asm/uaccess.h>
38#include <asm/pgtable.h>
39#include <asm/dma.h>
40#include <asm/fixmap.h>
41#include <asm/e820.h>
42#include <asm/apic.h>
43#include <asm/tlb.h>
44#include <asm/tlbflush.h>
45#include <asm/sections.h>
46#include <asm/paravirt.h>
47
48unsigned int __VMALLOC_RESERVE = 128 << 20;
49
50DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
51unsigned long highstart_pfn, highend_pfn;
52
53static int noinline do_test_wp_bit(void);
54
55/*
56 * Creates a middle page table and puts a pointer to it in the
57 * given global directory (pgd) entry. In non-PAE builds this simply
58 * returns the pgd entry, since the middle layer is folded into it.
59 */
60static pmd_t * __init one_md_table_init(pgd_t *pgd)
61{
62 pud_t *pud;
63 pmd_t *pmd_table;
64
65#ifdef CONFIG_X86_PAE
66 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
67 pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
68
69 paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
70 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
71 pud = pud_offset(pgd, 0);
72 if (pmd_table != pmd_offset(pud, 0))
73 BUG();
74 }
75#endif
76 pud = pud_offset(pgd, 0);
77 pmd_table = pmd_offset(pud, 0);
78 return pmd_table;
79}
80
81/*
82 * Create a page table and place a pointer to it in a middle page
83 * directory entry.
84 */
85static pte_t * __init one_page_table_init(pmd_t *pmd)
86{
87 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
88 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
89
90 paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
91 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
92 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
93 }
94
95 return pte_offset_kernel(pmd, 0);
96}
97
98/*
99 * This function initializes a certain range of kernel virtual memory
100 * with new bootmem page tables, everywhere page tables are missing in
101 * the given range.
102 */
103
104/*
105 * NOTE: The pagetables are allocated contiguous on the physical space
106 * so we can cache the place of the first one and move around without
107 * checking the pgd every time.
108 */
109static void __init page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
110{
111 pgd_t *pgd;
112 pmd_t *pmd;
113 int pgd_idx, pmd_idx;
114 unsigned long vaddr;
115
116 vaddr = start;
117 pgd_idx = pgd_index(vaddr);
118 pmd_idx = pmd_index(vaddr);
119 pgd = pgd_base + pgd_idx;
120
121 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
122 pmd = one_md_table_init(pgd);
123 pmd = pmd + pmd_index(vaddr);
124 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
125 one_page_table_init(pmd);
126
127 vaddr += PMD_SIZE;
128 }
129 pmd_idx = 0;
130 }
131}
132
133static inline int is_kernel_text(unsigned long addr)
134{
135 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
136 return 1;
137 return 0;
138}
139
140/*
141 * This maps the physical memory to kernel virtual address space, a total
142 * of max_low_pfn pages, by creating page tables starting from address
143 * PAGE_OFFSET.
144 */
145static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
146{
147 unsigned long pfn;
148 pgd_t *pgd;
149 pmd_t *pmd;
150 pte_t *pte;
151 int pgd_idx, pmd_idx, pte_ofs;
152
153 pgd_idx = pgd_index(PAGE_OFFSET);
154 pgd = pgd_base + pgd_idx;
155 pfn = 0;
156
157 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
158 pmd = one_md_table_init(pgd);
159 if (pfn >= max_low_pfn)
160 continue;
161 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
162 unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
163
164 /* Map with big pages if possible, otherwise create normal page tables. */
165 if (cpu_has_pse) {
166 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
167 if (is_kernel_text(address) || is_kernel_text(address2))
168 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
169 else
170 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
171
172 pfn += PTRS_PER_PTE;
173 } else {
174 pte = one_page_table_init(pmd);
175
176 for (pte_ofs = 0;
177 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
178 pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
179 if (is_kernel_text(address))
180 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
181 else
182 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
183 }
184 }
185 }
186 }
187}
188
189static inline int page_kills_ppro(unsigned long pagenr)
190{
191 if (pagenr >= 0x70000 && pagenr <= 0x7003F)
192 return 1;
193 return 0;
194}
195
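/*
 * Return 1 if @pagenr lies in usable RAM according to the EFI memory map
 * (when EFI is enabled) or the e820 map, 0 otherwise.
 */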
196int page_is_ram(unsigned long pagenr)
197{
198 int i;
199 unsigned long addr, end;
200
201 if (efi_enabled) {
202 efi_memory_desc_t *md;
203 void *p;
204
205 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
206 md = p;
207 if (!is_available_memory(md))
208 continue;
209 addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
210 end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
211
212 if ((pagenr >= addr) && (pagenr < end))
213 return 1;
214 }
215 return 0;
216 }
217
218 for (i = 0; i < e820.nr_map; i++) {
219
220 if (e820.map[i].type != E820_RAM) /* not usable memory */
221 continue;
222 /*
223 * !!!FIXME!!! Some BIOSen report areas as RAM that
224 * are not. Notably the 640K->1MB area. We need a sanity
225 * check here.
226 */
227 addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
228 end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
229 if ((pagenr >= addr) && (pagenr < end))
230 return 1;
231 }
232 return 0;
233}
234
235#ifdef CONFIG_HIGHMEM
236pte_t *kmap_pte;
237pgprot_t kmap_prot;
238
239#define kmap_get_fixmap_pte(vaddr) \
240 pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
241
242static void __init kmap_init(void)
243{
244 unsigned long kmap_vstart;
245
246 /* cache the first kmap pte */
247 kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
248 kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
249
250 kmap_prot = PAGE_KERNEL;
251}
252
253static void __init permanent_kmaps_init(pgd_t *pgd_base)
254{
255 pgd_t *pgd;
256 pud_t *pud;
257 pmd_t *pmd;
258 pte_t *pte;
259 unsigned long vaddr;
260
261 vaddr = PKMAP_BASE;
262 page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
263
264 pgd = swapper_pg_dir + pgd_index(vaddr);
265 pud = pud_offset(pgd, vaddr);
266 pmd = pmd_offset(pud, vaddr);
267 pte = pte_offset_kernel(pmd, vaddr);
268 pkmap_page_table = pte;
269}
270
271static void __meminit free_new_highpage(struct page *page)
272{
273 init_page_count(page);
274 __free_page(page);
275 totalhigh_pages++;
276}
277
278void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
279{
280 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
281 ClearPageReserved(page);
282 free_new_highpage(page);
283 } else
284 SetPageReserved(page);
285}
286
287static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
288{
289 free_new_highpage(page);
290 totalram_pages++;
291#ifdef CONFIG_FLATMEM
292 max_mapnr = max(pfn, max_mapnr);
293#endif
294 num_physpages++;
295 return 0;
296}
297
298/*
299 * Not currently handling the NUMA case.
300 * Assuming a single node, and that all memory
301 * added dynamically and onlined here
302 * ends up in HIGHMEM.
303 */
304void __meminit online_page(struct page *page)
305{
306 ClearPageReserved(page);
307 add_one_highpage_hotplug(page, page_to_pfn(page));
308}
309
310
311#ifdef CONFIG_NUMA
312extern void set_highmem_pages_init(int);
313#else
314static void __init set_highmem_pages_init(int bad_ppro)
315{
316 int pfn;
317 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
318 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
319 totalram_pages += totalhigh_pages;
320}
321#endif /* CONFIG_NUMA */
322
323#else
324#define kmap_init() do { } while (0)
325#define permanent_kmaps_init(pgd_base) do { } while (0)
326#define set_highmem_pages_init(bad_ppro) do { } while (0)
327#endif /* CONFIG_HIGHMEM */
328
329unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
330EXPORT_SYMBOL(__PAGE_KERNEL);
331unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
332
333#ifdef CONFIG_NUMA
334extern void __init remap_numa_kva(void);
335#else
336#define remap_numa_kva() do {} while (0)
337#endif
338
339void __init native_pagetable_setup_start(pgd_t *base)
340{
341#ifdef CONFIG_X86_PAE
342 int i;
343
344 /*
345 * Init entries of the first-level page table to the
346 * zero page, if they haven't already been set up.
347 *
348 * In a normal native boot, we'll be running on a
349 * pagetable rooted in swapper_pg_dir, but not in PAE
350 * mode, so this will end up clobbering the mappings
351 * for the lower 24Mbytes of the address space,
352 * without affecting the kernel address space.
353 */
354 for (i = 0; i < USER_PTRS_PER_PGD; i++)
355 set_pgd(&base[i],
356 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
357
358 /* Make sure kernel address space is empty so that a pagetable
359 will be allocated for it. */
360 memset(&base[USER_PTRS_PER_PGD], 0,
361 KERNEL_PGD_PTRS * sizeof(pgd_t));
362#else
363 paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
364#endif
365}
366
367void __init native_pagetable_setup_done(pgd_t *base)
368{
369#ifdef CONFIG_X86_PAE
370 /*
371 * Add low memory identity-mappings - SMP needs it when
372 * starting up on an AP from real-mode. In the non-PAE
373 * case we already have these mappings through head.S.
374 * All user-space mappings are explicitly cleared after
375 * SMP startup.
376 */
377 set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
378#endif
379}
380
381/*
382 * Build a proper pagetable for the kernel mappings. Up until this
383 * point, we've been running on some set of pagetables constructed by
384 * the boot process.
385 *
386 * If we're booting on native hardware, this will be a pagetable
387 * constructed in arch/i386/kernel/head.S, and not running in PAE mode
388 * (even if we'll end up running in PAE). The root of the pagetable
389 * will be swapper_pg_dir.
390 *
391 * If we're booting paravirtualized under a hypervisor, then there are
392 * more options: we may already be running PAE, and the pagetable may
393 * or may not be based in swapper_pg_dir. In any case,
394 * paravirt_pagetable_setup_start() will set up swapper_pg_dir
395 * appropriately for the rest of the initialization to work.
396 *
397 * In general, pagetable_init() assumes that the pagetable may already
398 * be partially populated, and so it avoids stomping on any existing
399 * mappings.
400 */
401static void __init pagetable_init(void)
402{
403 unsigned long vaddr, end;
404 pgd_t *pgd_base = swapper_pg_dir;
405
406 paravirt_pagetable_setup_start(pgd_base);
407
408 /* Enable PSE if available */
409 if (cpu_has_pse)
410 set_in_cr4(X86_CR4_PSE);
411
412 /* Enable PGE if available */
413 if (cpu_has_pge) {
414 set_in_cr4(X86_CR4_PGE);
415 __PAGE_KERNEL |= _PAGE_GLOBAL;
416 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
417 }
418
419 kernel_physical_mapping_init(pgd_base);
420 remap_numa_kva();
421
422 /*
423 * Fixed mappings, only the page table structure has to be
424 * created - mappings will be set by set_fixmap():
425 */
426 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
427 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
428 page_table_range_init(vaddr, end, pgd_base);
429
430 permanent_kmaps_init(pgd_base);
431
432 paravirt_pagetable_setup_done(pgd_base);
433}
434
435#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI)
436/*
437 * Swap suspend & friends need this for resume because things like the intel-agp
438 * driver might have split up a kernel 4MB mapping.
439 */
440char __nosavedata swsusp_pg_dir[PAGE_SIZE]
441 __attribute__ ((aligned (PAGE_SIZE)));
442
443static inline void save_pg_dir(void)
444{
445 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
446}
447#else
448static inline void save_pg_dir(void)
449{
450}
451#endif
452
453void zap_low_mappings(void)
454{
455 int i;
456
457 save_pg_dir();
458
459 /*
460 * Zap initial low-memory mappings.
461 *
462 * Note that "pgd_clear()" doesn't do it for
463 * us, because pgd_clear() is a no-op on i386.
464 */
465 for (i = 0; i < USER_PTRS_PER_PGD; i++)
466#ifdef CONFIG_X86_PAE
467 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
468#else
469 set_pgd(swapper_pg_dir+i, __pgd(0));
470#endif
471 flush_tlb_all();
472}
473
474int nx_enabled = 0;
475
476#ifdef CONFIG_X86_PAE
477
478static int disable_nx __initdata = 0;
479u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
480EXPORT_SYMBOL_GPL(__supported_pte_mask);
481
482/*
483 * noexec = on|off
484 *
485 * Control non executable mappings.
486 *
487 * on Enable
488 * off Disable
489 */
490static int __init noexec_setup(char *str)
491{
492 if (!str || !strcmp(str, "on")) {
493 if (cpu_has_nx) {
494 __supported_pte_mask |= _PAGE_NX;
495 disable_nx = 0;
496 }
497 } else if (!strcmp(str,"off")) {
498 disable_nx = 1;
499 __supported_pte_mask &= ~_PAGE_NX;
500 } else
501 return -EINVAL;
502
503 return 0;
504}
505early_param("noexec", noexec_setup);
506
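/*
 * Probe CPUID 0x80000001 for the NX feature and, unless "noexec=off" was
 * given, enable it in EFER and in the supported pte mask.
 */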
507static void __init set_nx(void)
508{
509 unsigned int v[4], l, h;
510
511 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
512 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
513 if ((v[3] & (1 << 20)) && !disable_nx) {
514 rdmsr(MSR_EFER, l, h);
515 l |= EFER_NX;
516 wrmsr(MSR_EFER, l, h);
517 nx_enabled = 1;
518 __supported_pte_mask |= _PAGE_NX;
519 }
520 }
521}
522
523/*
524 * Enables/disables executability of a given kernel page and
525 * returns the previous setting.
526 */
527int __init set_kernel_exec(unsigned long vaddr, int enable)
528{
529 pte_t *pte;
530 int ret = 1;
531
532 if (!nx_enabled)
533 goto out;
534
535 pte = lookup_address(vaddr);
536 BUG_ON(!pte);
537
538 if (!pte_exec_kernel(*pte))
539 ret = 0;
540
541 if (enable)
542 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
543 else
544 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
545 pte_update_defer(&init_mm, vaddr, pte);
546 __flush_tlb_all();
547out:
548 return ret;
549}
550
551#endif
552
553/*
554 * paging_init() sets up the page tables - note that the first 8MB are
555 * already mapped by head.S.
556 *
557 * This routine also unmaps the page at virtual kernel address 0, so
558 * that we can trap those pesky NULL-reference errors in the kernel.
559 */
560void __init paging_init(void)
561{
562#ifdef CONFIG_X86_PAE
563 set_nx();
564 if (nx_enabled)
565 printk("NX (Execute Disable) protection: active\n");
566#endif
567
568 pagetable_init();
569
570 load_cr3(swapper_pg_dir);
571
572#ifdef CONFIG_X86_PAE
573 /*
574 * We will bail out later - printk doesn't work right now so
575 * the user would just see a hanging kernel.
576 */
577 if (cpu_has_pae)
578 set_in_cr4(X86_CR4_PAE);
579#endif
580 __flush_tlb_all();
581
582 kmap_init();
583}
584
585/*
586 * Test if the WP bit works in supervisor mode. It isn't supported on 386's
587 * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
588 * used to involve black magic jumps to work around some nasty CPU bugs,
589 * but fortunately the switch to using exceptions got rid of all that.
590 */
591
592static void __init test_wp_bit(void)
593{
594 printk("Checking if this processor honours the WP bit even in supervisor mode... ");
595
596 /* Any page-aligned address will do, the test is non-destructive */
597 __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
598 boot_cpu_data.wp_works_ok = do_test_wp_bit();
599 clear_fixmap(FIX_WP_TEST);
600
601 if (!boot_cpu_data.wp_works_ok) {
602 printk("No.\n");
603#ifdef CONFIG_X86_WP_WORKS_OK
604 panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
605#endif
606 } else {
607 printk("Ok.\n");
608 }
609}
610
611static struct kcore_list kcore_mem, kcore_vmalloc;
612
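/*
 * Final boot-time memory setup: release low memory from bootmem to the page
 * allocator, initialize highmem, register /proc/kcore regions and print the
 * virtual memory layout.
 */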
613void __init mem_init(void)
614{
615 extern int ppro_with_ram_bug(void);
616 int codesize, reservedpages, datasize, initsize;
617 int tmp;
618 int bad_ppro;
619
620#ifdef CONFIG_FLATMEM
621 BUG_ON(!mem_map);
622#endif
623
624 bad_ppro = ppro_with_ram_bug();
625
626#ifdef CONFIG_HIGHMEM
627 /* check that fixmap and pkmap do not overlap */
628 if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
629 printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
630 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
631 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
632 BUG();
633 }
634#endif
635
636 /* this will put all low memory onto the freelists */
637 totalram_pages += free_all_bootmem();
638
639 reservedpages = 0;
640 for (tmp = 0; tmp < max_low_pfn; tmp++)
641 /*
642 * Only count reserved RAM pages
643 */
644 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
645 reservedpages++;
646
647 set_highmem_pages_init(bad_ppro);
648
649 codesize = (unsigned long) &_etext - (unsigned long) &_text;
650 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
651 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
652
653 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
654 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
655 VMALLOC_END-VMALLOC_START);
656
657 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
658 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
659 num_physpages << (PAGE_SHIFT-10),
660 codesize >> 10,
661 reservedpages << (PAGE_SHIFT-10),
662 datasize >> 10,
663 initsize >> 10,
664 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
665 );
666
667#if 1 /* double-sanity-check paranoia */
668 printk("virtual kernel memory layout:\n"
669 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
670#ifdef CONFIG_HIGHMEM
671 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
672#endif
673 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
674 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
675 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
676 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
677 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
678 FIXADDR_START, FIXADDR_TOP,
679 (FIXADDR_TOP - FIXADDR_START) >> 10,
680
681#ifdef CONFIG_HIGHMEM
682 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
683 (LAST_PKMAP*PAGE_SIZE) >> 10,
684#endif
685
686 VMALLOC_START, VMALLOC_END,
687 (VMALLOC_END - VMALLOC_START) >> 20,
688
689 (unsigned long)__va(0), (unsigned long)high_memory,
690 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
691
692 (unsigned long)&__init_begin, (unsigned long)&__init_end,
693 ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
694
695 (unsigned long)&_etext, (unsigned long)&_edata,
696 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
697
698 (unsigned long)&_text, (unsigned long)&_etext,
699 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
700
701#ifdef CONFIG_HIGHMEM
702 BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
703 BUG_ON(VMALLOC_END > PKMAP_BASE);
704#endif
705 BUG_ON(VMALLOC_START > VMALLOC_END);
706 BUG_ON((unsigned long)high_memory > VMALLOC_START);
707#endif /* double-sanity-check paranoia */
708
709#ifdef CONFIG_X86_PAE
710 if (!cpu_has_pae)
711 panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
712#endif
713 if (boot_cpu_data.wp_works_ok < 0)
714 test_wp_bit();
715
716 /*
717 * Subtle. SMP is doing its boot stuff late (because it has to
718 * fork idle threads) - but it also needs low mappings for the
719 * protected-mode entry to work. We zap these entries only after
720 * the WP-bit has been tested.
721 */
722#ifndef CONFIG_SMP
723 zap_low_mappings();
724#endif
725}
726
727#ifdef CONFIG_MEMORY_HOTPLUG
728int arch_add_memory(int nid, u64 start, u64 size)
729{
730 struct pglist_data *pgdata = NODE_DATA(nid);
731 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
732 unsigned long start_pfn = start >> PAGE_SHIFT;
733 unsigned long nr_pages = size >> PAGE_SHIFT;
734
735 return __add_pages(zone, start_pfn, nr_pages);
736}
737
738int remove_memory(u64 start, u64 size)
739{
740 return -EINVAL;
741}
742EXPORT_SYMBOL_GPL(remove_memory);
743#endif
744
745struct kmem_cache *pmd_cache;
746
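/*
 * Set up the slab cache used for PAE pmd pages; with a non-shared kernel
 * pmd the pgd additionally has to be a full page.
 */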
747void __init pgtable_cache_init(void)
748{
749 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
750
751 if (PTRS_PER_PMD > 1) {
752 pmd_cache = kmem_cache_create("pmd",
753 PTRS_PER_PMD*sizeof(pmd_t),
754 PTRS_PER_PMD*sizeof(pmd_t),
755 SLAB_PANIC,
756 pmd_ctor);
757 if (!SHARED_KERNEL_PMD) {
758 /* If we're in PAE mode and have a non-shared
759 kernel pmd, then the pgd size must be a
760 page size. This is because the pgd_list
761 links through the page structure, so there
762 can only be one pgd per page for this to
763 work. */
764 pgd_size = PAGE_SIZE;
765 }
766 }
767}
768
769/*
770 * This function cannot be __init, since exceptions don't work in that
771 * section. Put this after the callers, so that it cannot be inlined.
772 */
773static int noinline do_test_wp_bit(void)
774{
775 char tmp_reg;
776 int flag;
777
778 __asm__ __volatile__(
779 " movb %0,%1 \n"
780 "1: movb %1,%0 \n"
781 " xorl %2,%2 \n"
782 "2: \n"
783 ".section __ex_table,\"a\"\n"
784 " .align 4 \n"
785 " .long 1b,2b \n"
786 ".previous \n"
787 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
788 "=q" (tmp_reg),
789 "=r" (flag)
790 :"2" (1)
791 :"memory");
792
793 return flag;
794}
795
796#ifdef CONFIG_DEBUG_RODATA
797
798void mark_rodata_ro(void)
799{
800 unsigned long start = PFN_ALIGN(_text);
801 unsigned long size = PFN_ALIGN(_etext) - start;
802
803#ifndef CONFIG_KPROBES
804#ifdef CONFIG_HOTPLUG_CPU
805 /* It must still be possible to apply SMP alternatives. */
806 if (num_possible_cpus() <= 1)
807#endif
808 {
809 change_page_attr(virt_to_page(start),
810 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
811 printk("Write protecting the kernel text: %luk\n", size >> 10);
812 }
813#endif
814 start += size;
815 size = (unsigned long)__end_rodata - start;
816 change_page_attr(virt_to_page(start),
817 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
818 printk("Write protecting the kernel read-only data: %luk\n",
819 size >> 10);
820
821 /*
822 * change_page_attr() requires a global_flush_tlb() call after it.
823 * We do this after the printk so that if something went wrong in the
824 * change, the printk gets out at least to give a better debug hint
825 * of who is the culprit.
826 */
827 global_flush_tlb();
828}
829#endif
830
831void free_init_pages(char *what, unsigned long begin, unsigned long end)
832{
833 unsigned long addr;
834
835 for (addr = begin; addr < end; addr += PAGE_SIZE) {
836 ClearPageReserved(virt_to_page(addr));
837 init_page_count(virt_to_page(addr));
838 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
839 free_page(addr);
840 totalram_pages++;
841 }
842 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
843}
844
845void free_initmem(void)
846{
847 free_init_pages("unused kernel memory",
848 (unsigned long)(&__init_begin),
849 (unsigned long)(&__init_end));
850}
851
852#ifdef CONFIG_BLK_DEV_INITRD
853void free_initrd_mem(unsigned long start, unsigned long end)
854{
855 free_init_pages("initrd memory", start, end);
856}
857#endif
858
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 000000000000..458893b376f8
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,750 @@
1/*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 */
8
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/pagemap.h>
22#include <linux/bootmem.h>
23#include <linux/proc_fs.h>
24#include <linux/pci.h>
25#include <linux/pfn.h>
26#include <linux/poison.h>
27#include <linux/dma-mapping.h>
28#include <linux/module.h>
29#include <linux/memory_hotplug.h>
30#include <linux/nmi.h>
31
32#include <asm/processor.h>
33#include <asm/system.h>
34#include <asm/uaccess.h>
35#include <asm/pgtable.h>
36#include <asm/pgalloc.h>
37#include <asm/dma.h>
38#include <asm/fixmap.h>
39#include <asm/e820.h>
40#include <asm/apic.h>
41#include <asm/tlb.h>
42#include <asm/mmu_context.h>
43#include <asm/proto.h>
44#include <asm/smp.h>
45#include <asm/sections.h>
46
47#ifndef Dprintk
48#define Dprintk(x...)
49#endif
50
51const struct dma_mapping_ops *dma_ops;
52EXPORT_SYMBOL(dma_ops);
53
54static unsigned long dma_reserve __initdata;
55
56DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
57
58/*
59 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
60 * in physical memory, so we can cache the location of the first one and
61 * move around without checking the pgd every time.
62 */
63
64void show_mem(void)
65{
66 long i, total = 0, reserved = 0;
67 long shared = 0, cached = 0;
68 pg_data_t *pgdat;
69 struct page *page;
70
71 printk(KERN_INFO "Mem-info:\n");
72 show_free_areas();
73 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
74
75 for_each_online_pgdat(pgdat) {
76 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages
78 so update the NMI watchdog */
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80 touch_nmi_watchdog();
81 }
82 if (!pfn_valid(pgdat->node_start_pfn + i))
83 continue;
84 page = pfn_to_page(pgdat->node_start_pfn + i);
85 total++;
86 if (PageReserved(page))
87 reserved++;
88 else if (PageSwapCache(page))
89 cached++;
90 else if (page_count(page))
91 shared += page_count(page) - 1;
92 }
93 }
94 printk(KERN_INFO "%lu pages of RAM\n", total);
95 printk(KERN_INFO "%lu reserved pages\n",reserved);
96 printk(KERN_INFO "%lu pages shared\n",shared);
97 printk(KERN_INFO "%lu pages swap cached\n",cached);
98}
99
100int after_bootmem;
101
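/*
 * Get a zeroed page for a kernel page table: from the bootmem allocator
 * during early boot, from the page allocator afterwards.
 */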
102static __init void *spp_getpage(void)
103{
104 void *ptr;
105 if (after_bootmem)
106 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
107 else
108 ptr = alloc_bootmem_pages(PAGE_SIZE);
109 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111
112 Dprintk("spp_getpage %p\n", ptr);
113 return ptr;
114}
115
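/*
 * Install a single kernel pte mapping @vaddr to @phys with protection @prot,
 * allocating missing pmd/pte pages; the pgd entry must already have been set
 * up by head.S.
 */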
116static __init void set_pte_phys(unsigned long vaddr,
117 unsigned long phys, pgprot_t prot)
118{
119 pgd_t *pgd;
120 pud_t *pud;
121 pmd_t *pmd;
122 pte_t *pte, new_pte;
123
124 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
125
126 pgd = pgd_offset_k(vaddr);
127 if (pgd_none(*pgd)) {
128 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
129 return;
130 }
131 pud = pud_offset(pgd, vaddr);
132 if (pud_none(*pud)) {
133 pmd = (pmd_t *) spp_getpage();
134 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135 if (pmd != pmd_offset(pud, 0)) {
136 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
137 return;
138 }
139 }
140 pmd = pmd_offset(pud, vaddr);
141 if (pmd_none(*pmd)) {
142 pte = (pte_t *) spp_getpage();
143 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144 if (pte != pte_offset_kernel(pmd, 0)) {
145 printk("PAGETABLE BUG #02!\n");
146 return;
147 }
148 }
149 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
150
151 pte = pte_offset_kernel(pmd, vaddr);
152 if (!pte_none(*pte) &&
153 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
154 pte_ERROR(*pte);
155 set_pte(pte, new_pte);
156
157 /*
158 * It's enough to flush this one mapping.
159 * (PGE mappings get flushed as well)
160 */
161 __flush_tlb_one(vaddr);
162}
163
164/* NOTE: this is meant to be run only at boot */
165void __init
166__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167{
168 unsigned long address = __fix_to_virt(idx);
169
170 if (idx >= __end_of_fixed_addresses) {
171 printk("Invalid __set_fixmap\n");
172 return;
173 }
174 set_pte_phys(address, phys, prot);
175}
176
177unsigned long __meminitdata table_start, table_end;
178
179static __meminit void *alloc_low_page(unsigned long *phys)
180{
181 unsigned long pfn = table_end++;
182 void *adr;
183
184 if (after_bootmem) {
185 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186 *phys = __pa(adr);
187 return adr;
188 }
189
190 if (pfn >= end_pfn)
191 panic("alloc_low_page: ran out of memory");
192
193 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194 memset(adr, 0, PAGE_SIZE);
195 *phys = pfn * PAGE_SIZE;
196 return adr;
197}
198
199static __meminit void unmap_low_page(void *adr)
200{
201
202 if (after_bootmem)
203 return;
204
205 early_iounmap(adr, PAGE_SIZE);
206}
207
208/* Must run before zap_low_mappings */
209__meminit void *early_ioremap(unsigned long addr, unsigned long size)
210{
211 unsigned long vaddr;
212 pmd_t *pmd, *last_pmd;
213 int i, pmds;
214
215 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216 vaddr = __START_KERNEL_map;
217 pmd = level2_kernel_pgt;
218 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
219 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220 for (i = 0; i < pmds; i++) {
221 if (pmd_present(pmd[i]))
222 goto next;
223 }
224 vaddr += addr & ~PMD_MASK;
225 addr &= PMD_MASK;
226 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
228 __flush_tlb();
229 return (void *)vaddr;
230 next:
231 ;
232 }
233 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
234 return NULL;
235}
236
237/* To avoid virtual aliases later */
238__meminit void early_iounmap(void *addr, unsigned long size)
239{
240 unsigned long vaddr;
241 pmd_t *pmd;
242 int i, pmds;
243
244 vaddr = (unsigned long)addr;
245 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246 pmd = level2_kernel_pgt + pmd_index(vaddr);
247 for (i = 0; i < pmds; i++)
248 pmd_clear(pmd + i);
249 __flush_tlb();
250}
251
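/*
 * Fill a pmd page with 2MB (PSE) entries for the kernel direct mapping of
 * [address, end); during early boot, entries beyond 'end' are cleared.
 */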
252static void __meminit
253phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
254{
255 int i = pmd_index(address);
256
257 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258 unsigned long entry;
259 pmd_t *pmd = pmd_page + pmd_index(address);
260
261 if (address >= end) {
262 if (!after_bootmem)
263 for (; i < PTRS_PER_PMD; i++, pmd++)
264 set_pmd(pmd, __pmd(0));
265 break;
266 }
267
268 if (pmd_val(*pmd))
269 continue;
270
271 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
272 entry &= __supported_pte_mask;
273 set_pmd(pmd, __pmd(entry));
274 }
275}
276
277static void __meminit
278phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279{
280 pmd_t *pmd = pmd_offset(pud,0);
281 spin_lock(&init_mm.page_table_lock);
282 phys_pmd_init(pmd, address, end);
283 spin_unlock(&init_mm.page_table_lock);
284 __flush_tlb_all();
285}
286
287static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
288{
289 int i = pud_index(addr);
290
291
292 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd;
296
297 if (addr >= end)
298 break;
299
300 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
301 set_pud(pud, __pud(0));
302 continue;
303 }
304
305 if (pud_val(*pud)) {
306 phys_pmd_update(pud, addr, end);
307 continue;
308 }
309
310 pmd = alloc_low_page(&pmd_phys);
311 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(pmd);
316 }
317 __flush_tlb();
318}
319
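/*
 * Estimate how much room the direct-mapping page tables need for memory up
 * to 'end' and find a free e820 area to hold them.
 */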
320static void __init find_early_table_space(unsigned long end)
321{
322 unsigned long puds, pmds, tables, start;
323
324 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
325 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
326 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328
329 /* RED-PEN putting page tables only on node 0 could
330 cause a hotspot and fill up ZONE_DMA. The page tables
331 need roughly 0.5KB per GB. */
332 start = 0x8000;
333 table_start = find_e820_area(start, end, tables);
334 if (table_start == -1UL)
335 panic("Cannot find space for the kernel page tables");
336
337 table_start >>= PAGE_SHIFT;
338 table_end = table_start;
339
340 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
341 end, table_start << PAGE_SHIFT,
342 (table_start << PAGE_SHIFT) + tables);
343}
344
345/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
346 This runs before bootmem is initialized and gets pages directly from the
347 physical memory. To access them they are temporarily mapped. */
348void __meminit init_memory_mapping(unsigned long start, unsigned long end)
349{
350 unsigned long next;
351
352 Dprintk("init_memory_mapping\n");
353
354 /*
355 * Find space for the kernel direct mapping tables.
356 * Later we should allocate these tables in the local node of the memory
357 * mapped. Unfortunately this is done currently before the nodes are
358 * discovered.
359 */
360 if (!after_bootmem)
361 find_early_table_space(end);
362
363 start = (unsigned long)__va(start);
364 end = (unsigned long)__va(end);
365
366 for (; start < end; start = next) {
367 unsigned long pud_phys;
368 pgd_t *pgd = pgd_offset_k(start);
369 pud_t *pud;
370
371 if (after_bootmem)
372 pud = pud_offset(pgd, start & PGDIR_MASK);
373 else
374 pud = alloc_low_page(&pud_phys);
375
376 next = start + PGDIR_SIZE;
377 if (next > end)
378 next = end;
379 phys_pud_init(pud, __pa(start), __pa(next));
380 if (!after_bootmem)
381 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382 unmap_low_page(pud);
383 }
384
385 if (!after_bootmem)
386 mmu_cr4_features = read_cr4();
387 __flush_tlb_all();
388}
389
390#ifndef CONFIG_NUMA
391void __init paging_init(void)
392{
393 unsigned long max_zone_pfns[MAX_NR_ZONES];
394 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
397 max_zone_pfns[ZONE_NORMAL] = end_pfn;
398
399 memory_present(0, 0, end_pfn);
400 sparse_init();
401 free_area_init_nodes(max_zone_pfns);
402}
403#endif
404
405/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
406 from the CPU leading to inconsistent cache lines. address and size
407 must be aligned to 2MB boundaries.
408 Does nothing when the mapping doesn't exist. */
409void __init clear_kernel_mapping(unsigned long address, unsigned long size)
410{
411 unsigned long end = address + size;
412
413 BUG_ON(address & ~LARGE_PAGE_MASK);
414 BUG_ON(size & ~LARGE_PAGE_MASK);
415
416 for (; address < end; address += LARGE_PAGE_SIZE) {
417 pgd_t *pgd = pgd_offset_k(address);
418 pud_t *pud;
419 pmd_t *pmd;
420 if (pgd_none(*pgd))
421 continue;
422 pud = pud_offset(pgd, address);
423 if (pud_none(*pud))
424 continue;
425 pmd = pmd_offset(pud, address);
426 if (!pmd || pmd_none(*pmd))
427 continue;
428 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
429 /* Could handle this, but it should not happen currently. */
430 printk(KERN_ERR
431 "clear_kernel_mapping: mapping has been split. will leak memory\n");
432 pmd_ERROR(*pmd);
433 }
434 set_pmd(pmd, __pmd(0));
435 }
436 __flush_tlb_all();
437}
438
439/*
440 * Memory hotplug specific functions
441 */
442void online_page(struct page *page)
443{
444 ClearPageReserved(page);
445 init_page_count(page);
446 __free_page(page);
447 totalram_pages++;
448 num_physpages++;
449}
450
451#ifdef CONFIG_MEMORY_HOTPLUG
452/*
453 * Memory is always added to the NORMAL zone. This means you will never get
454 * additional DMA/DMA32 memory.
455 */
456int arch_add_memory(int nid, u64 start, u64 size)
457{
458 struct pglist_data *pgdat = NODE_DATA(nid);
459 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
460 unsigned long start_pfn = start >> PAGE_SHIFT;
461 unsigned long nr_pages = size >> PAGE_SHIFT;
462 int ret;
463
464 init_memory_mapping(start, (start + size -1));
465
466 ret = __add_pages(zone, start_pfn, nr_pages);
467 if (ret)
468 goto error;
469
470 return ret;
471error:
472 printk("%s: Problem encountered in __add_pages!\n", __func__);
473 return ret;
474}
475EXPORT_SYMBOL_GPL(arch_add_memory);
476
477int remove_memory(u64 start, u64 size)
478{
479 return -EINVAL;
480}
481EXPORT_SYMBOL_GPL(remove_memory);
482
483#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
484int memory_add_physaddr_to_nid(u64 start)
485{
486 return 0;
487}
488EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
489#endif
490
491#endif /* CONFIG_MEMORY_HOTPLUG */
492
493#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
494/*
495 * Memory hot-add without sparsemem. The mem_maps have been allocated in advance;
496 * just online the pages.
497 */
498int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
499{
500 int err = -EIO;
501 unsigned long pfn;
502 unsigned long total = 0, mem = 0;
503 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
504 if (pfn_valid(pfn)) {
505 online_page(pfn_to_page(pfn));
506 err = 0;
507 mem++;
508 }
509 total++;
510 }
511 if (!err) {
512 z->spanned_pages += total;
513 z->present_pages += mem;
514 z->zone_pgdat->node_spanned_pages += total;
515 z->zone_pgdat->node_present_pages += mem;
516 }
517 return err;
518}
519#endif
520
521static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
522 kcore_vsyscall;
523
524void __init mem_init(void)
525{
526 long codesize, reservedpages, datasize, initsize;
527
528 pci_iommu_alloc();
529
530 /* clear the zero-page */
531 memset(empty_zero_page, 0, PAGE_SIZE);
532
533 reservedpages = 0;
534
535 /* this will put all low memory onto the freelists */
536#ifdef CONFIG_NUMA
537 totalram_pages = numa_free_all_bootmem();
538#else
539 totalram_pages = free_all_bootmem();
540#endif
541 reservedpages = end_pfn - totalram_pages -
542 absent_pages_in_range(0, end_pfn);
543
544 after_bootmem = 1;
545
546 codesize = (unsigned long) &_etext - (unsigned long) &_text;
547 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
548 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
549
550 /* Register memory areas for /proc/kcore */
551 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
552 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
553 VMALLOC_END-VMALLOC_START);
554 kclist_add(&kcore_kernel, &_stext, _end - _stext);
555 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
556 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
557 VSYSCALL_END - VSYSCALL_START);
558
559 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
560 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
561 end_pfn << (PAGE_SHIFT-10),
562 codesize >> 10,
563 reservedpages << (PAGE_SHIFT-10),
564 datasize >> 10,
565 initsize >> 10);
566}
567
568void free_init_pages(char *what, unsigned long begin, unsigned long end)
569{
570 unsigned long addr;
571
572 if (begin >= end)
573 return;
574
575 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
576 for (addr = begin; addr < end; addr += PAGE_SIZE) {
577 ClearPageReserved(virt_to_page(addr));
578 init_page_count(virt_to_page(addr));
579 memset((void *)(addr & ~(PAGE_SIZE-1)),
580 POISON_FREE_INITMEM, PAGE_SIZE);
581 if (addr >= __START_KERNEL_map)
582 change_page_attr_addr(addr, 1, __pgprot(0));
583 free_page(addr);
584 totalram_pages++;
585 }
586 if (addr > __START_KERNEL_map)
587 global_flush_tlb();
588}
589
590void free_initmem(void)
591{
592 free_init_pages("unused kernel memory",
593 (unsigned long)(&__init_begin),
594 (unsigned long)(&__init_end));
595}
596
597#ifdef CONFIG_DEBUG_RODATA
598
599void mark_rodata_ro(void)
600{
601 unsigned long start = (unsigned long)_stext, end;
602
603#ifdef CONFIG_HOTPLUG_CPU
604 /* It must still be possible to apply SMP alternatives. */
605 if (num_possible_cpus() > 1)
606 start = (unsigned long)_etext;
607#endif
608
609#ifdef CONFIG_KPROBES
610 start = (unsigned long)__start_rodata;
611#endif
612
613 end = (unsigned long)__end_rodata;
614 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
615 end &= PAGE_MASK;
616 if (end <= start)
617 return;
618
619 change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
620
621 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
622 (end - start) >> 10);
623
624 /*
625 * change_page_attr_addr() requires a global_flush_tlb() call after it.
626 * We do this after the printk so that if something went wrong in the
627 * change, the printk gets out at least to give a better debug hint
628 * of who is the culprit.
629 */
630 global_flush_tlb();
631}
632#endif
633
634#ifdef CONFIG_BLK_DEV_INITRD
635void free_initrd_mem(unsigned long start, unsigned long end)
636{
637 free_init_pages("initrd memory", start, end);
638}
639#endif
640
641void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
642{
643#ifdef CONFIG_NUMA
644 int nid = phys_to_nid(phys);
645#endif
646 unsigned long pfn = phys >> PAGE_SHIFT;
647 if (pfn >= end_pfn) {
648 /* This can happen with kdump kernels when accessing firmware
649 tables. */
650 if (pfn < end_pfn_map)
651 return;
652 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
653 phys, len);
654 return;
655 }
656
657 /* Should check here against the e820 map to avoid double free */
658#ifdef CONFIG_NUMA
659 reserve_bootmem_node(NODE_DATA(nid), phys, len);
660#else
661 reserve_bootmem(phys, len);
662#endif
663 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
664 dma_reserve += len / PAGE_SIZE;
665 set_dma_reserve(dma_reserve);
666 }
667}
668
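/*
 * Check whether a (canonical) kernel virtual address is backed by a present
 * mapping, walking all page-table levels and handling 2MB large pages.
 */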
669int kern_addr_valid(unsigned long addr)
670{
671 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
672 pgd_t *pgd;
673 pud_t *pud;
674 pmd_t *pmd;
675 pte_t *pte;
676
677 if (above != 0 && above != -1UL)
678 return 0;
679
680 pgd = pgd_offset_k(addr);
681 if (pgd_none(*pgd))
682 return 0;
683
684 pud = pud_offset(pgd, addr);
685 if (pud_none(*pud))
686 return 0;
687
688 pmd = pmd_offset(pud, addr);
689 if (pmd_none(*pmd))
690 return 0;
691 if (pmd_large(*pmd))
692 return pfn_valid(pmd_pfn(*pmd));
693
694 pte = pte_offset_kernel(pmd, addr);
695 if (pte_none(*pte))
696 return 0;
697 return pfn_valid(pte_pfn(*pte));
698}
699
700/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
701 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
702 not need special handling anymore. */
703
704static struct vm_area_struct gate_vma = {
705 .vm_start = VSYSCALL_START,
706 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
707 .vm_page_prot = PAGE_READONLY_EXEC,
708 .vm_flags = VM_READ | VM_EXEC
709};
710
711struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
712{
713#ifdef CONFIG_IA32_EMULATION
714 if (test_tsk_thread_flag(tsk, TIF_IA32))
715 return NULL;
716#endif
717 return &gate_vma;
718}
719
720int in_gate_area(struct task_struct *task, unsigned long addr)
721{
722 struct vm_area_struct *vma = get_gate_vma(task);
723 if (!vma)
724 return 0;
725 return (addr >= vma->vm_start) && (addr < vma->vm_end);
726}
727
728/* Use this when you have no reliable task/vma, typically from interrupt
729 * context. It is less reliable than using the task's vma and may give
730 * false positives.
731 */
732int in_gate_area_no_task(unsigned long addr)
733{
734 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
735}
736
737void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
738{
739 return __alloc_bootmem_core(pgdat->bdata, size,
740 SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
741}
742
743const char *arch_vma_name(struct vm_area_struct *vma)
744{
745 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
746 return "[vdso]";
747 if (vma == &gate_vma)
748 return "[vsyscall]";
749 return NULL;
750}
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
new file mode 100644
index 000000000000..0b278315d737
--- /dev/null
+++ b/arch/x86/mm/ioremap_32.c
@@ -0,0 +1,274 @@
1/*
2 * arch/i386/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PC's
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16#include <asm/fixmap.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/pgtable.h>
20
21#define ISA_START_ADDRESS 0xa0000
22#define ISA_END_ADDRESS 0x100000
23
24/*
25 * Generic mapping function (not visible outside):
26 */
27
28/*
29 * Remap an arbitrary physical address space into the kernel virtual
30 * address space. Needed when the kernel wants to access high addresses
31 * directly.
32 *
33 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
34 * have to convert them into an offset in a page-aligned mapping, but the
35 * caller shouldn't need to know that small detail.
36 */
37void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
38{
39 void __iomem * addr;
40 struct vm_struct * area;
41 unsigned long offset, last_addr;
42 pgprot_t prot;
43
44 /* Don't allow wraparound or zero size */
45 last_addr = phys_addr + size - 1;
46 if (!size || last_addr < phys_addr)
47 return NULL;
48
49 /*
50 * Don't remap the low PCI/ISA area, it's always mapped..
51 */
52 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
53 return (void __iomem *) phys_to_virt(phys_addr);
54
55 /*
56 * Don't allow anybody to remap normal RAM that we're using..
57 */
58 if (phys_addr <= virt_to_phys(high_memory - 1)) {
59 char *t_addr, *t_end;
60 struct page *page;
61
62 t_addr = __va(phys_addr);
63 t_end = t_addr + (size - 1);
64
65 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
66 if(!PageReserved(page))
67 return NULL;
68 }
69
70 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
71 | _PAGE_ACCESSED | flags);
72
73 /*
74 * Mappings have to be page-aligned
75 */
76 offset = phys_addr & ~PAGE_MASK;
77 phys_addr &= PAGE_MASK;
78 size = PAGE_ALIGN(last_addr+1) - phys_addr;
79
80 /*
81 * Ok, go for it..
82 */
83 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
84 if (!area)
85 return NULL;
86 area->phys_addr = phys_addr;
87 addr = (void __iomem *) area->addr;
88 if (ioremap_page_range((unsigned long) addr,
89 (unsigned long) addr + size, phys_addr, prot)) {
90 vunmap((void __force *) addr);
91 return NULL;
92 }
93 return (void __iomem *) (offset + (char __iomem *)addr);
94}
95EXPORT_SYMBOL(__ioremap);
96
97/**
98 * ioremap_nocache - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap_nocache performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * This version of ioremap ensures that the memory is marked uncachable
109 * on the CPU as well as honouring existing caching rules from things like
110 * the PCI bus. Note that there are other caches and buffers on many
111 * busses. In particular, driver authors should read up on PCI writes.
112 *
113 * It's useful if some control registers are in such an area and
114 * write combining or read caching is not desirable:
115 *
116 * Must be freed with iounmap.
117 */
118
119void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
120{
121 unsigned long last_addr;
122 void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
123 if (!p)
124 return p;
125
126 /* Guaranteed to be > phys_addr, as per __ioremap() */
127 last_addr = phys_addr + size - 1;
128
129 if (last_addr < virt_to_phys(high_memory) - 1) {
130 struct page *ppage = virt_to_page(__va(phys_addr));
131 unsigned long npages;
132
133 phys_addr &= PAGE_MASK;
134
135 /* This might overflow and become zero.. */
136 last_addr = PAGE_ALIGN(last_addr);
137
138 /* .. but that's ok, because modulo-2**n arithmetic will make
139 * the page-aligned "last - first" come out right.
140 */
141 npages = (last_addr - phys_addr) >> PAGE_SHIFT;
142
143 if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
144 iounmap(p);
145 p = NULL;
146 }
147 global_flush_tlb();
148 }
149
150 return p;
151}
152EXPORT_SYMBOL(ioremap_nocache);
153
154/**
155 * iounmap - Free a IO remapping
156 * @addr: virtual address from ioremap_*
157 *
158 * Caller must ensure there is only one unmapping for the same pointer.
159 */
160void iounmap(volatile void __iomem *addr)
161{
162 struct vm_struct *p, *o;
163
164 if ((void __force *)addr <= high_memory)
165 return;
166
167 /*
168 * __ioremap special-cases the PCI/ISA range by not instantiating a
169 * vm_area and by simply returning an address into the kernel mapping
170 * of ISA space. So handle that here.
171 */
172 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
173 addr < phys_to_virt(ISA_END_ADDRESS))
174 return;
175
176 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
177
178 /* Use the vm area unlocked, assuming the caller
179 ensures there isn't another iounmap for the same address
180 in parallel. Reuse of the virtual address is prevented by
181 leaving it in the global lists until we're done with it.
182 cpa takes care of the direct mappings. */
183 read_lock(&vmlist_lock);
184 for (p = vmlist; p; p = p->next) {
185 if (p->addr == addr)
186 break;
187 }
188 read_unlock(&vmlist_lock);
189
190 if (!p) {
191 printk("iounmap: bad address %p\n", addr);
192 dump_stack();
193 return;
194 }
195
196 /* Reset the direct mapping. Can block */
197 if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
198 change_page_attr(virt_to_page(__va(p->phys_addr)),
199 get_vm_area_size(p) >> PAGE_SHIFT,
200 PAGE_KERNEL);
201 global_flush_tlb();
202 }
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
211void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
212{
213 unsigned long offset, last_addr;
214 unsigned int nrpages;
215 enum fixed_addresses idx;
216
217 /* Don't allow wraparound or zero size */
218 last_addr = phys_addr + size - 1;
219 if (!size || last_addr < phys_addr)
220 return NULL;
221
222 /*
223 * Don't remap the low PCI/ISA area, it's always mapped..
224 */
225 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
226 return phys_to_virt(phys_addr);
227
228 /*
229 * Mappings have to be page-aligned
230 */
231 offset = phys_addr & ~PAGE_MASK;
232 phys_addr &= PAGE_MASK;
233 size = PAGE_ALIGN(last_addr) - phys_addr;
234
235 /*
236 * Mappings have to fit in the FIX_BTMAP area.
237 */
238 nrpages = size >> PAGE_SHIFT;
239 if (nrpages > NR_FIX_BTMAPS)
240 return NULL;
241
242 /*
243 * Ok, go for it..
244 */
245 idx = FIX_BTMAP_BEGIN;
246 while (nrpages > 0) {
247 set_fixmap(idx, phys_addr);
248 phys_addr += PAGE_SIZE;
249 --idx;
250 --nrpages;
251 }
252 return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
253}
254
255void __init bt_iounmap(void *addr, unsigned long size)
256{
257 unsigned long virt_addr;
258 unsigned long offset;
259 unsigned int nrpages;
260 enum fixed_addresses idx;
261
262 virt_addr = (unsigned long)addr;
263 if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
264 return;
265 offset = virt_addr & ~PAGE_MASK;
266 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
267
268 idx = FIX_BTMAP_BEGIN;
269 while (nrpages > 0) {
270 clear_fixmap(idx);
271 --idx;
272 --nrpages;
273 }
274}
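/*
 * Illustrative sketch only: bt_ioremap()/bt_iounmap() are for very early
 * boot code that needs a temporary mapping (via the FIX_BTMAP fixmap
 * slots) before the normal ioremap machinery is available, e.g. to peek
 * at a firmware table.  "table_phys" and "table_len" are made-up names.
 */
#if 0
	char *p = bt_ioremap(table_phys, table_len);
	if (p) {
		u8 revision = p[8];	/* read a field from the table */
		bt_iounmap(p, table_len);
	}
#endif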
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
new file mode 100644
index 000000000000..6cac90aa5032
--- /dev/null
+++ b/arch/x86/mm/ioremap_64.c
@@ -0,0 +1,210 @@
1/*
2 * arch/x86_64/mm/ioremap.c
3 *
4 * Re-map IO memory to kernel address space so that we can access it.
5 * This is needed for high PCI addresses that aren't mapped in the
6 * 640k-1MB IO memory area on PCs
7 *
8 * (C) Copyright 1995 1996 Linus Torvalds
9 */
10
11#include <linux/vmalloc.h>
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17#include <asm/pgalloc.h>
18#include <asm/fixmap.h>
19#include <asm/tlbflush.h>
20#include <asm/cacheflush.h>
21#include <asm/proto.h>
22
23unsigned long __phys_addr(unsigned long x)
24{
25 if (x >= __START_KERNEL_map)
26 return x - __START_KERNEL_map + phys_base;
27 return x - PAGE_OFFSET;
28}
29EXPORT_SYMBOL(__phys_addr);
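/*
 * Worked example (illustrative; exact constants depend on the kernel
 * layout of this era): with __START_KERNEL_map at 0xffffffff80000000 and
 * PAGE_OFFSET at 0xffff810000000000, a kernel-text address such as
 * 0xffffffff80200000 yields phys_base + 0x200000, while a direct-map
 * address such as 0xffff810001000000 yields 0x1000000 (16 MB).
 */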
30
31#define ISA_START_ADDRESS 0xa0000
32#define ISA_END_ADDRESS 0x100000
33
34/*
35 * Fix up the linear direct mapping of the kernel to avoid cache attribute
36 * conflicts.
37 */
38static int
39ioremap_change_attr(unsigned long phys_addr, unsigned long size,
40 unsigned long flags)
41{
42 int err = 0;
43 if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
44 unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
45 unsigned long vaddr = (unsigned long) __va(phys_addr);
46
47 /*
48	 * Must use an address here and not a struct page because the phys addr
49	 * can be in a hole between nodes and not have a memmap entry.
50 */
51 err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
52 if (!err)
53 global_flush_tlb();
54 }
55 return err;
56}
57
58/*
59 * Generic mapping function
60 */
61
62/*
63 * Remap an arbitrary physical address space into the kernel virtual
64 * address space. Needed when the kernel wants to access high addresses
65 * directly.
66 *
67 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
68 * have to convert them into an offset in a page-aligned mapping, but the
69 * caller shouldn't need to know that small detail.
70 */
71void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
72{
73 void * addr;
74 struct vm_struct * area;
75 unsigned long offset, last_addr;
76 pgprot_t pgprot;
77
78 /* Don't allow wraparound or zero size */
79 last_addr = phys_addr + size - 1;
80 if (!size || last_addr < phys_addr)
81 return NULL;
82
83 /*
84 * Don't remap the low PCI/ISA area, it's always mapped..
85 */
86 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
87 return (__force void __iomem *)phys_to_virt(phys_addr);
88
89#ifdef CONFIG_FLATMEM
90 /*
91 * Don't allow anybody to remap normal RAM that we're using..
92 */
93 if (last_addr < virt_to_phys(high_memory)) {
94 char *t_addr, *t_end;
95 struct page *page;
96
97 t_addr = __va(phys_addr);
98 t_end = t_addr + (size - 1);
99
100 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
101 if(!PageReserved(page))
102 return NULL;
103 }
104#endif
105
106 pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
107 | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
108 /*
109 * Mappings have to be page-aligned
110 */
111 offset = phys_addr & ~PAGE_MASK;
112 phys_addr &= PAGE_MASK;
113 size = PAGE_ALIGN(last_addr+1) - phys_addr;
114
115 /*
116 * Ok, go for it..
117 */
118 area = get_vm_area(size, VM_IOREMAP | (flags << 20));
119 if (!area)
120 return NULL;
121 area->phys_addr = phys_addr;
122 addr = area->addr;
123 if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
124 phys_addr, pgprot)) {
125 remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
126 return NULL;
127 }
128 if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
129 area->flags &= 0xffffff;
130 vunmap(addr);
131 return NULL;
132 }
133 return (__force void __iomem *) (offset + (char *)addr);
134}
135EXPORT_SYMBOL(__ioremap);
136
137/**
138 * ioremap_nocache - map bus memory into CPU space
139 * @offset: bus address of the memory
140 * @size: size of the resource to map
141 *
142 * ioremap_nocache performs a platform specific sequence of operations to
143 * make bus memory CPU accessible via the readb/readw/readl/writeb/
144 * writew/writel functions and the other mmio helpers. The returned
145 * address is not guaranteed to be usable directly as a virtual
146 * address.
147 *
148 * This version of ioremap ensures that the memory is marked uncachable
149 * on the CPU as well as honouring existing caching rules from things like
150 * the PCI bus. Note that there are other caches and buffers on many
151 * busses. In particular, driver authors should read up on PCI writes.
152 *
153 * It's useful if some control registers are in such an area and
154 * write combining or read caching is not desirable.
155 *
156 * Must be freed with iounmap.
157 */
158
159void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
160{
161 return __ioremap(phys_addr, size, _PAGE_PCD);
162}
163EXPORT_SYMBOL(ioremap_nocache);
164
165/**
166 * iounmap - Free an IO remapping
167 * @addr: virtual address from ioremap_*
168 *
169 * Caller must ensure there is only one unmapping for the same pointer.
170 */
171void iounmap(volatile void __iomem *addr)
172{
173 struct vm_struct *p, *o;
174
175 if (addr <= high_memory)
176 return;
177 if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
178 addr < phys_to_virt(ISA_END_ADDRESS))
179 return;
180
181 addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
182 /* Use the vm area unlocked, assuming the caller
183 ensures there isn't another iounmap for the same address
184 in parallel. Reuse of the virtual address is prevented by
185 leaving it in the global lists until we're done with it.
186 cpa takes care of the direct mappings. */
187 read_lock(&vmlist_lock);
188 for (p = vmlist; p; p = p->next) {
189 if (p->addr == addr)
190 break;
191 }
192 read_unlock(&vmlist_lock);
193
194 if (!p) {
195 printk("iounmap: bad address %p\n", addr);
196 dump_stack();
197 return;
198 }
199
200 /* Reset the direct mapping. Can block */
201 if (p->flags >> 20)
202 ioremap_change_attr(p->phys_addr, p->size, 0);
203
204 /* Finally remove it */
205 o = remove_vm_area((void *)addr);
206 BUG_ON(p != o || o == NULL);
207 kfree(p);
208}
209EXPORT_SYMBOL(iounmap);
210
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
new file mode 100644
index 000000000000..a96006f7ae0c
--- /dev/null
+++ b/arch/x86/mm/k8topology_64.c
@@ -0,0 +1,182 @@
1/*
2 * AMD K8 NUMA support.
3 * Discover the memory map and associated nodes.
4 *
5 * This version reads it directly from the K8 northbridge.
6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/string.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <asm/io.h>
15#include <linux/pci_ids.h>
16#include <asm/types.h>
17#include <asm/mmzone.h>
18#include <asm/proto.h>
19#include <asm/e820.h>
20#include <asm/pci-direct.h>
21#include <asm/numa.h>
22
23static __init int find_northbridge(void)
24{
25 int num;
26
27 for (num = 0; num < 32; num++) {
28 u32 header;
29
30 header = read_pci_config(0, num, 0, 0x00);
31 if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
32 continue;
33
34 header = read_pci_config(0, num, 1, 0x00);
35 if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
36 continue;
37 return num;
38 }
39
40 return -1;
41}
42
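/*
 * Worked example (illustrative numbers): the DRAM base/limit registers
 * read at 0x40/0x44 below carry address bits [39:24] in register bits
 * [31:16].  A base register of 0x00000003 (enabled, base 0) together
 * with a limit register of 0x003f0000 decodes, after the shifts in
 * k8_scan_nodes(), to the range 0 - 0x40000000 (the first 1 GB) owned
 * by node (limit & 7) = 0.
 */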
43int __init k8_scan_nodes(unsigned long start, unsigned long end)
44{
45 unsigned long prevbase;
46 struct bootnode nodes[8];
47 int nodeid, i, j, nb;
48 unsigned char nodeids[8];
49 int found = 0;
50 u32 reg;
51 unsigned numnodes;
52 unsigned num_cores;
53
54 if (!early_pci_allowed())
55 return -1;
56
57 nb = find_northbridge();
58 if (nb < 0)
59 return nb;
60
61 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
62
63 num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
64 printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
65
66 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1;
68 if (numnodes <= 1)
69 return -1;
70
71 printk(KERN_INFO "Number of nodes %d\n", numnodes);
72
73 memset(&nodes,0,sizeof(nodes));
74 prevbase = 0;
75 for (i = 0; i < 8; i++) {
76 unsigned long base,limit;
77 u32 nodeid;
78
79 base = read_pci_config(0, nb, 1, 0x40 + i*8);
80 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
81
82 nodeid = limit & 7;
83 nodeids[i] = nodeid;
84 if ((base & 3) == 0) {
85 if (i < numnodes)
86 printk("Skipping disabled node %d\n", i);
87 continue;
88 }
89 if (nodeid >= numnodes) {
90 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
91 base, limit);
92 continue;
93 }
94
95 if (!limit) {
96 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
97 base);
98 continue;
99 }
100 if ((base >> 8) & 3 || (limit >> 8) & 3) {
101 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
102 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1;
104 }
105 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid);
108 continue;
109 }
110
111 limit >>= 16;
112 limit <<= 24;
113 limit |= (1<<24)-1;
114 limit++;
115
116 if (limit > end_pfn << PAGE_SHIFT)
117 limit = end_pfn << PAGE_SHIFT;
118 if (limit <= base)
119 continue;
120
121 base >>= 16;
122 base <<= 24;
123
124 if (base < start)
125 base = start;
126 if (limit > end)
127 limit = end;
128 if (limit == base) {
129 printk(KERN_ERR "Empty node %d\n", nodeid);
130 continue;
131 }
132 if (limit < base) {
133 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
134 nodeid, base, limit);
135 continue;
136 }
137
138		/* Could sort here, but punt for now. Should not happen anyway. */
139 if (prevbase > base) {
140 printk(KERN_ERR "Node map not sorted %lx,%lx\n",
141 prevbase,base);
142 return -1;
143 }
144
145 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
146 nodeid, base, limit);
147
148 found++;
149
150 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT,
154 nodes[nodeid].end >> PAGE_SHIFT);
155
156 prevbase = base;
157
158 node_set(nodeid, node_possible_map);
159 }
160
161 if (!found)
162 return -1;
163
164 memnode_shift = compute_hash_shift(nodes, 8);
165 if (memnode_shift < 0) {
166 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
167 return -1;
168 }
169 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
170
171 for (i = 0; i < 8; i++) {
172 if (nodes[i].start != nodes[i].end) {
173 nodeid = nodeids[i];
174 for (j = 0; j < num_cores; j++)
175 apicid_to_node[(nodeid * num_cores) + j] = i;
176 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
177 }
178 }
179
180 numa_init_array();
181 return 0;
182}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap_32.c
new file mode 100644
index 000000000000..552e08473755
--- /dev/null
+++ b/arch/x86/mm/mmap_32.c
@@ -0,0 +1,77 @@
1/*
2 * linux/arch/i386/mm/mmap.c
3 *
4 * flexible mmap layout support
5 *
6 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
7 * All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 *
24 * Started by Ingo Molnar <mingo@elte.hu>
25 */
26
27#include <linux/personality.h>
28#include <linux/mm.h>
29#include <linux/random.h>
30#include <linux/sched.h>
31
32/*
33 * Top of mmap area (just below the process stack).
34 *
35 * Leave at least a ~128 MB hole.
36 */
37#define MIN_GAP (128*1024*1024)
38#define MAX_GAP (TASK_SIZE/6*5)
39
40static inline unsigned long mmap_base(struct mm_struct *mm)
41{
42 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
43 unsigned long random_factor = 0;
44
45 if (current->flags & PF_RANDOMIZE)
46 random_factor = get_random_int() % (1024*1024);
47
48 if (gap < MIN_GAP)
49 gap = MIN_GAP;
50 else if (gap > MAX_GAP)
51 gap = MAX_GAP;
52
53 return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
54}
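/*
 * Worked example (illustrative): with the default 3 GB TASK_SIZE and an
 * 8 MB stack rlimit, the gap is clamped up to MIN_GAP (128 MB), so the
 * mmap base lands just below 0xb8000000, lowered by up to ~1 MB more
 * when PF_RANDOMIZE is set.
 */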
55
56/*
57 * This function, called very early during the creation of a new
58 * process VM image, sets up which VM layout function to use:
59 */
60void arch_pick_mmap_layout(struct mm_struct *mm)
61{
62 /*
63 * Fall back to the standard layout if the personality
64 * bit is set, or if the expected stack growth is unlimited:
65 */
66 if (sysctl_legacy_va_layout ||
67 (current->personality & ADDR_COMPAT_LAYOUT) ||
68 current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
69 mm->mmap_base = TASK_UNMAPPED_BASE;
70 mm->get_unmapped_area = arch_get_unmapped_area;
71 mm->unmap_area = arch_unmap_area;
72 } else {
73 mm->mmap_base = mmap_base(mm);
74 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
75 mm->unmap_area = arch_unmap_area_topdown;
76 }
77}
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
new file mode 100644
index 000000000000..80bba0dc000e
--- /dev/null
+++ b/arch/x86/mm/mmap_64.c
@@ -0,0 +1,29 @@
1/* Copyright 2005 Andi Kleen, SuSE Labs.
2 * Licensed under GPL, v.2
3 */
4#include <linux/mm.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7#include <asm/ia32.h>
8
9/* Notebook: move the mmap code from sys_x86_64.c over here. */
10
11void arch_pick_mmap_layout(struct mm_struct *mm)
12{
13#ifdef CONFIG_IA32_EMULATION
14 if (current_thread_info()->flags & _TIF_IA32)
15 return ia32_pick_mmap_layout(mm);
16#endif
17 mm->mmap_base = TASK_UNMAPPED_BASE;
18 if (current->flags & PF_RANDOMIZE) {
19		/* Add 28 bits of randomness, which covers about 40 bits of
20		   address space because the mmap base has to be page aligned,
21		   or roughly 1/128 of the total user VM
22		   (the total user address space is 47 bits). */
23 unsigned rnd = get_random_int() & 0xfffffff;
24 mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
25 }
26 mm->get_unmapped_area = arch_get_unmapped_area;
27 mm->unmap_area = arch_unmap_area;
28}
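/*
 * Worked example (illustrative): rnd is at most 2^28 - 1 and is shifted
 * by PAGE_SHIFT (12), so the base moves by up to about 2^40 bytes
 * (~1 TB), i.e. roughly 1/128 of the 47-bit (128 TB) user address space,
 * as the comment above says.
 */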
29
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 000000000000..6da235522269
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,648 @@
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28struct memnode memnode;
29
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
37
38int numa_off __initdata;
39unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
41
42
43/*
44 * Given a shift value, try to populate memnodemap[]
45 * Returns:
46 * 1 if OK
47 * 0 if memnodemap[] too small (or shift too small)
48 * -1 if node overlap or lost RAM (shift too big)
49 */
50static int __init
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
52{
53 int i;
54 int res = -1;
55 unsigned long addr, end;
56
57 memset(memnodemap, 0xff, memnodemapsize);
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
62 continue;
63 if ((end >> shift) >= memnodemapsize)
64 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
67 return -1;
68 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift);
70 } while (addr < end);
71 res = 1;
72 }
73 return res;
74}
75
76static int __init allocate_cachealigned_memnodemap(void)
77{
78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48)
82 return 0;
83
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
102
103/*
104 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
110 int i, nodes_used = 0;
111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
119 bitfield |= start;
120 nodes_used++;
121 if (end > memtop)
122 memtop = end;
123 }
124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
133{
134 int shift;
135
136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
140 shift);
141
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO
144 "Your memory is not aligned you need to rebuild your kernel "
145 "with a bigger NODEMAPSIZE shift=%d\n",
146 shift);
147 return -1;
148 }
149 return shift;
150}
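/*
 * Worked example (illustrative): once the shift and memnodemap[] are set
 * up, phys_to_nid() is just memnodemap[addr >> memnode_shift].  With two
 * nodes covering 0-2 GB and 2-4 GB the derived shift is 31, so an
 * address such as 0x80000000 indexes slot 1 and resolves to node 1.
 */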
151
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn)
154{
155 return phys_to_nid(pfn << PAGE_SHIFT);
156}
157#endif
158
159static void * __init
160early_node_mem(int nodeid, unsigned long start, unsigned long end,
161 unsigned long size)
162{
163 unsigned long mem = find_e820_area(start, end, size);
164 void *ptr;
165 if (mem != -1L)
166 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
169 if (ptr == 0) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid);
172 return NULL;
173 }
174 return ptr;
175}
176
177/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
179{
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
181 unsigned long nodedata_phys;
182 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184
185 start = round_up(start, ZONE_ALIGN);
186
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
188
189 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT;
191
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
193 if (node_data[nodeid] == NULL)
194 return;
195 nodedata_phys = __pa(node_data[nodeid]);
196
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
199 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201
202 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
205 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT);
207 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
210 node_data[nodeid] = NULL;
211 return;
212 }
213 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn);
219
220 free_bootmem_with_active_regions(nodeid, end);
221
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid);
226#endif
227 node_set_online(nodeid);
228}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253
254void __init numa_init_array(void)
255{
256 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259	   mapping. To avoid this, fill in the mapping for all possible
260	   CPUs, as the number of CPUs is not known yet.
261	   We round-robin the existing nodes. */
262 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node[i] != NUMA_NO_NODE)
265 continue;
266 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map);
270 }
271
272}
273
274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
276char *cmdline __initdata;
277
278/*
279 * Sets up nid to range from addr to addr + size. If the end boundary is
280 * greater than max_addr, then max_addr is used instead. The return value is 0
281 * if there is additional memory left for allocation past addr and -1 otherwise.
282 * addr is adjusted to be at the end of the node.
283 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr)
286{
287 int ret = 0;
288 nodes[nid].start = *addr;
289 *addr += size;
290 if (*addr >= max_addr) {
291 *addr = max_addr;
292 ret = -1;
293 }
294 nodes[nid].end = *addr;
295 node_set(nid, node_possible_map);
296 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
297 nodes[nid].start, nodes[nid].end,
298 (nodes[nid].end - nodes[nid].start) >> 20);
299 return ret;
300}
301
302/*
303 * Splits num_nodes nodes up equally starting at node_start. The return value
304 * is the number of nodes split up and addr is adjusted to be at the end of the
305 * last node allocated.
306 */
307static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
308 u64 max_addr, int node_start,
309 int num_nodes)
310{
311 unsigned int big;
312 u64 size;
313 int i;
314
315 if (num_nodes <= 0)
316 return -1;
317 if (num_nodes > MAX_NUMNODES)
318 num_nodes = MAX_NUMNODES;
319 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
320 num_nodes;
321 /*
322 * Calculate the number of big nodes that can be allocated as a result
323 * of consolidating the leftovers.
324 */
325 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
326 FAKE_NODE_MIN_SIZE;
327
328 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
329 size &= FAKE_NODE_MIN_HASH_MASK;
330 if (!size) {
331 printk(KERN_ERR "Not enough memory for each node. "
332 "NUMA emulation disabled.\n");
333 return -1;
334 }
335
336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size;
338 if (i < big)
339 end += FAKE_NODE_MIN_SIZE;
340 /*
341 * The final node can have the remaining system RAM. Other
342 * nodes receive roughly the same amount of available pages.
343 */
344 if (i == num_nodes + node_start - 1)
345 end = max_addr;
346 else
347 while (end - *addr - e820_hole_size(*addr, end) <
348 size) {
349 end += FAKE_NODE_MIN_SIZE;
350 if (end > max_addr) {
351 end = max_addr;
352 break;
353 }
354 }
355 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
356 break;
357 }
358 return i - node_start + 1;
359}
360
361/*
362 * Splits the remaining system RAM into chunks of size. The remaining memory is
363 * always assigned to a final node and can be asymmetric. Returns the number of
364 * nodes split.
365 */
366static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
367 u64 max_addr, int node_start, u64 size)
368{
369 int i = node_start;
370 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
371 while (!setup_node_range(i++, nodes, addr, size, max_addr))
372 ;
373 return i - node_start;
374}
375
376/*
377 * Sets up the system RAM area from start_pfn to end_pfn according to the
378 * numa=fake command-line option.
379 */
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{
382 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391
392 memset(&nodes, 0, sizeof(nodes));
393 /*
394 * If the numa=fake command-line is just a single number N, split the
395 * system RAM into N fake nodes.
396 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
399 simple_strtol(cmdline, NULL, 0));
400 if (num_nodes < 0)
401 return num_nodes;
402 goto out;
403 }
404
405 /* Parse the command line. */
406 for (coeff_flag = 0; ; cmdline++) {
407 if (*cmdline && isdigit(*cmdline)) {
408 num = num * 10 + *cmdline - '0';
409 continue;
410 }
411 if (*cmdline == '*') {
412 if (num > 0)
413 coeff = num;
414 coeff_flag = 1;
415 }
416 if (!*cmdline || *cmdline == ',') {
417 if (!coeff_flag)
418 coeff = 1;
419 /*
420 * Round down to the nearest FAKE_NODE_MIN_SIZE.
421 * Command-line coefficients are in megabytes.
422 */
423 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
424 if (size)
425 for (i = 0; i < coeff; i++, num_nodes++)
426 if (setup_node_range(num_nodes, nodes,
427 &addr, size, max_addr) < 0)
428 goto done;
429 if (!*cmdline)
430 break;
431 coeff_flag = 0;
432 coeff = -1;
433 }
434 num = 0;
435 }
436done:
437 if (!num_nodes)
438 return -1;
439 /* Fill remainder of system RAM, if appropriate. */
440 if (addr < max_addr) {
441 if (coeff_flag && coeff < 0) {
442 /* Split remaining nodes into num-sized chunks */
443 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
444 num_nodes, num);
445 goto out;
446 }
447 switch (*(cmdline - 1)) {
448 case '*':
449 /* Split remaining nodes into coeff chunks */
450 if (coeff <= 0)
451 break;
452 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
453 num_nodes, coeff);
454 break;
455 case ',':
456 /* Do not allocate remaining system RAM */
457 break;
458 default:
459 /* Give one final node */
460 setup_node_range(num_nodes, nodes, &addr,
461 max_addr - addr, max_addr);
462 num_nodes++;
463 }
464 }
465out:
466 memnode_shift = compute_hash_shift(nodes, num_nodes);
467 if (memnode_shift < 0) {
468 memnode_shift = 0;
469 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
470 "disabled.\n");
471 return -1;
472 }
473
474 /*
475 * We need to vacate all active ranges that may have been registered by
476 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
477 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
478 */
479 remove_all_active_ranges();
480#ifdef CONFIG_ACPI_NUMA
481 acpi_numa = -1;
482#endif
483 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 }
488 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array();
490 return 0;
491}
492#endif /* CONFIG_NUMA_EMU */
493
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{
496 int i;
497
498 nodes_clear(node_possible_map);
499
500#ifdef CONFIG_NUMA_EMU
501 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return;
503 nodes_clear(node_possible_map);
504#endif
505
506#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT))
509 return;
510 nodes_clear(node_possible_map);
511#endif
512
513#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
515 return;
516 nodes_clear(node_possible_map);
517#endif
518 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */
525 memnode_shift = 63;
526 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0;
528 nodes_clear(node_online_map);
529 node_set_online(0);
530 node_set(0, node_possible_map);
531 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0);
534 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536}
537
538__cpuinit void numa_add_cpu(int cpu)
539{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
541}
542
543void __cpuinit numa_set_node(int cpu, int node)
544{
545 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node[cpu] = node;
547}
548
549unsigned long __init numa_free_all_bootmem(void)
550{
551 int i;
552 unsigned long pages = 0;
553 for_each_online_node(i) {
554 pages += free_all_bootmem_node(NODE_DATA(i));
555 }
556 return pages;
557}
558
559void __init paging_init(void)
560{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES];
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
566 max_zone_pfns[ZONE_NORMAL] = end_pfn;
567
568 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init();
570
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns);
576}
577
578static __init int numa_setup(char *opt)
579{
580 if (!opt)
581 return -EINVAL;
582 if (!strncmp(opt,"off",3))
583 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5;
587#endif
588#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6))
590 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif
594 return 0;
595}
596
597early_param("numa", numa_setup);
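/*
 * Boot-option summary derived from numa_setup() and numa_emulation()
 * above (example command lines, not an exhaustive list):
 *
 *   numa=off              disable NUMA detection
 *   numa=fake=4           split RAM into four equally sized emulated nodes
 *   numa=fake=2*512,1024  two 512 MB nodes, one 1024 MB node, with any
 *                         remaining RAM placed in a final node
 *   numa=noacpi           ignore the ACPI SRAT table
 *   numa=hotadd=10        set the SRAT hotadd percentage to 10
 */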
598
599/*
600 * Set up early cpu_to_node.
601 *
602 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[]
603 * and apicid_to_node[] tables have valid entries for a CPU.
604 * This means we skip cpu_to_node[] initialisation for the NUMA
605 * emulation and fake-node cases (when running a kernel compiled
606 * for NUMA on a non-NUMA box), which is OK because cpu_to_node[]
607 * was already initialized in a round-robin manner by numa_init_array(),
608 * prior to this call, and that initialization is good enough
609 * for the fake NUMA cases.
610 */
611void __init init_cpu_to_node(void)
612{
613 int i;
614 for (i = 0; i < NR_CPUS; i++) {
615 u8 apicid = x86_cpu_to_apicid[i];
616 if (apicid == BAD_APICID)
617 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue;
620 numa_set_node(i,apicid_to_node[apicid]);
621 }
622}
623
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
new file mode 100644
index 000000000000..4241a74d16c8
--- /dev/null
+++ b/arch/x86/mm/pageattr_32.c
@@ -0,0 +1,278 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/pgalloc.h>
15#include <asm/sections.h>
16
17static DEFINE_SPINLOCK(cpa_lock);
18static struct list_head df_list = LIST_HEAD_INIT(df_list);
19
20
21pte_t *lookup_address(unsigned long address)
22{
23 pgd_t *pgd = pgd_offset_k(address);
24 pud_t *pud;
25 pmd_t *pmd;
26 if (pgd_none(*pgd))
27 return NULL;
28 pud = pud_offset(pgd, address);
29 if (pud_none(*pud))
30 return NULL;
31 pmd = pmd_offset(pud, address);
32 if (pmd_none(*pmd))
33 return NULL;
34 if (pmd_large(*pmd))
35 return (pte_t *)pmd;
36 return pte_offset_kernel(pmd, address);
37}
38
39static struct page *split_large_page(unsigned long address, pgprot_t prot,
40 pgprot_t ref_prot)
41{
42 int i;
43 unsigned long addr;
44 struct page *base;
45 pte_t *pbase;
46
47 spin_unlock_irq(&cpa_lock);
48 base = alloc_pages(GFP_KERNEL, 0);
49 spin_lock_irq(&cpa_lock);
50 if (!base)
51 return NULL;
52
53 /*
54 * page_private is used to track the number of entries in
55	 * the page table page that have non-standard attributes.
56 */
57 SetPagePrivate(base);
58 page_private(base) = 0;
59
60 address = __pa(address);
61 addr = address & LARGE_PAGE_MASK;
62 pbase = (pte_t *)page_address(base);
63 paravirt_alloc_pt(&init_mm, page_to_pfn(base));
64 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
65 set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
66 addr == address ? prot : ref_prot));
67 }
68 return base;
69}
70
71static void cache_flush_page(struct page *p)
72{
73 unsigned long adr = (unsigned long)page_address(p);
74 int i;
75 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
76 asm volatile("clflush (%0)" :: "r" (adr + i));
77}
78
79static void flush_kernel_map(void *arg)
80{
81 struct list_head *lh = (struct list_head *)arg;
82 struct page *p;
83
84 /* High level code is not ready for clflush yet */
85 if (0 && cpu_has_clflush) {
86 list_for_each_entry (p, lh, lru)
87 cache_flush_page(p);
88 } else if (boot_cpu_data.x86_model >= 4)
89 wbinvd();
90
91	/* Flush all to work around errata in early Athlons regarding
92 * large page flushing.
93 */
94 __flush_tlb_all();
95}
96
97static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
98{
99 struct page *page;
100 unsigned long flags;
101
102 set_pte_atomic(kpte, pte); /* change init_mm */
103 if (SHARED_KERNEL_PMD)
104 return;
105
106 spin_lock_irqsave(&pgd_lock, flags);
107 for (page = pgd_list; page; page = (struct page *)page->index) {
108 pgd_t *pgd;
109 pud_t *pud;
110 pmd_t *pmd;
111 pgd = (pgd_t *)page_address(page) + pgd_index(address);
112 pud = pud_offset(pgd, address);
113 pmd = pmd_offset(pud, address);
114 set_pte_atomic((pte_t *)pmd, pte);
115 }
116 spin_unlock_irqrestore(&pgd_lock, flags);
117}
118
119/*
120 * No more special protections in this 2/4MB area - revert to a
121 * large page again.
122 */
123static inline void revert_page(struct page *kpte_page, unsigned long address)
124{
125 pgprot_t ref_prot;
126 pte_t *linear;
127
128 ref_prot =
129 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
130 ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
131
132 linear = (pte_t *)
133 pmd_offset(pud_offset(pgd_offset_k(address), address), address);
134 set_pmd_pte(linear, address,
135 pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
136 ref_prot));
137}
138
139static inline void save_page(struct page *kpte_page)
140{
141 if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
142 list_add(&kpte_page->lru, &df_list);
143}
144
145static int
146__change_page_attr(struct page *page, pgprot_t prot)
147{
148 pte_t *kpte;
149 unsigned long address;
150 struct page *kpte_page;
151
152 BUG_ON(PageHighMem(page));
153 address = (unsigned long)page_address(page);
154
155 kpte = lookup_address(address);
156 if (!kpte)
157 return -EINVAL;
158 kpte_page = virt_to_page(kpte);
159 BUG_ON(PageLRU(kpte_page));
160 BUG_ON(PageCompound(kpte_page));
161
162 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
163 if (!pte_huge(*kpte)) {
164 set_pte_atomic(kpte, mk_pte(page, prot));
165 } else {
166 pgprot_t ref_prot;
167 struct page *split;
168
169 ref_prot =
170 ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
171 ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
172 split = split_large_page(address, prot, ref_prot);
173 if (!split)
174 return -ENOMEM;
175 set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
176 kpte_page = split;
177 }
178 page_private(kpte_page)++;
179 } else if (!pte_huge(*kpte)) {
180 set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
181 BUG_ON(page_private(kpte_page) == 0);
182 page_private(kpte_page)--;
183 } else
184 BUG();
185
186 /*
187 * If the pte was reserved, it means it was created at boot
188 * time (not via split_large_page) and in turn we must not
189 * replace it with a largepage.
190 */
191
192 save_page(kpte_page);
193 if (!PageReserved(kpte_page)) {
194 if (cpu_has_pse && (page_private(kpte_page) == 0)) {
195 paravirt_release_pt(page_to_pfn(kpte_page));
196 revert_page(kpte_page, address);
197 }
198 }
199 return 0;
200}
201
202static inline void flush_map(struct list_head *l)
203{
204 on_each_cpu(flush_kernel_map, l, 1, 1);
205}
206
207/*
208 * Change the page attributes of a page in the linear mapping.
209 *
210 * This should be used when a page is mapped with a different caching policy
211 * than write-back somewhere - some CPUs do not like it when mappings with
212 * different caching policies exist. This changes the page attributes of the
213 * in kernel linear mapping too.
214 *
215 * The caller needs to ensure that there are no conflicting mappings elsewhere.
216 * This function only deals with the kernel linear map.
217 *
218 * Caller must call global_flush_tlb() after this.
219 */
220int change_page_attr(struct page *page, int numpages, pgprot_t prot)
221{
222 int err = 0;
223 int i;
224 unsigned long flags;
225
226 spin_lock_irqsave(&cpa_lock, flags);
227 for (i = 0; i < numpages; i++, page++) {
228 err = __change_page_attr(page, prot);
229 if (err)
230 break;
231 }
232 spin_unlock_irqrestore(&cpa_lock, flags);
233 return err;
234}
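/*
 * A minimal usage sketch (hypothetical): mark one kernel page uncached
 * and later restore it; ioremap_nocache() in ioremap_32.c follows the
 * same pattern.  "page" stands for any lowmem struct page.
 */
#if 0
	if (change_page_attr(page, 1, PAGE_KERNEL_NOCACHE) == 0)
		global_flush_tlb();	/* required after change_page_attr() */

	/* later, revert to the default write-back mapping */
	change_page_attr(page, 1, PAGE_KERNEL);
	global_flush_tlb();
#endif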
235
236void global_flush_tlb(void)
237{
238 struct list_head l;
239 struct page *pg, *next;
240
241 BUG_ON(irqs_disabled());
242
243 spin_lock_irq(&cpa_lock);
244 list_replace_init(&df_list, &l);
245 spin_unlock_irq(&cpa_lock);
246 flush_map(&l);
247 list_for_each_entry_safe(pg, next, &l, lru) {
248 list_del(&pg->lru);
249 clear_bit(PG_arch_1, &pg->flags);
250 if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
251 continue;
252 ClearPagePrivate(pg);
253 __free_page(pg);
254 }
255}
256
257#ifdef CONFIG_DEBUG_PAGEALLOC
258void kernel_map_pages(struct page *page, int numpages, int enable)
259{
260 if (PageHighMem(page))
261 return;
262 if (!enable)
263 debug_check_no_locks_freed(page_address(page),
264 numpages * PAGE_SIZE);
265
266 /* the return value is ignored - the calls cannot fail,
267 * large pages are disabled at boot time.
268 */
269 change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
270	/* We should perform an IPI and flush all TLBs,
271	 * but that can deadlock, so flush only the current CPU.
272 */
273 __flush_tlb_all();
274}
275#endif
276
277EXPORT_SYMBOL(change_page_attr);
278EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
new file mode 100644
index 000000000000..10b9809ce821
--- /dev/null
+++ b/arch/x86/mm/pageattr_64.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback.
4 */
5
6#include <linux/mm.h>
7#include <linux/sched.h>
8#include <linux/highmem.h>
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <asm/uaccess.h>
12#include <asm/processor.h>
13#include <asm/tlbflush.h>
14#include <asm/io.h>
15
16pte_t *lookup_address(unsigned long address)
17{
18 pgd_t *pgd = pgd_offset_k(address);
19 pud_t *pud;
20 pmd_t *pmd;
21 pte_t *pte;
22 if (pgd_none(*pgd))
23 return NULL;
24 pud = pud_offset(pgd, address);
25 if (!pud_present(*pud))
26 return NULL;
27 pmd = pmd_offset(pud, address);
28 if (!pmd_present(*pmd))
29 return NULL;
30 if (pmd_large(*pmd))
31 return (pte_t *)pmd;
32 pte = pte_offset_kernel(pmd, address);
33 if (pte && !pte_present(*pte))
34 pte = NULL;
35 return pte;
36}
37
38static struct page *split_large_page(unsigned long address, pgprot_t prot,
39 pgprot_t ref_prot)
40{
41 int i;
42 unsigned long addr;
43 struct page *base = alloc_pages(GFP_KERNEL, 0);
44 pte_t *pbase;
45 if (!base)
46 return NULL;
47 /*
48 * page_private is used to track the number of entries in
49	 * the page table page that have non-standard attributes.
50 */
51 SetPagePrivate(base);
52 page_private(base) = 0;
53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
58 pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
59 addr == address ? prot : ref_prot);
60 }
61 return base;
62}
63
64static void cache_flush_page(void *adr)
65{
66 int i;
67 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
68 asm volatile("clflush (%0)" :: "r" (adr + i));
69}
70
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76	/* When clflush is available, always use it because it is
77 much cheaper than WBINVD. */
78 /* clflush is still broken. Disable for now. */
79 if (1 || !cpu_has_clflush)
80 asm volatile("wbinvd" ::: "memory");
81 else list_for_each_entry(pg, l, lru) {
82 void *adr = page_address(pg);
83 cache_flush_page(adr);
84 }
85 __flush_tlb_all();
86}
87
88static inline void flush_map(struct list_head *l)
89{
90 on_each_cpu(flush_kernel_map, l, 1, 1);
91}
92
93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
94
95static inline void save_page(struct page *fpage)
96{
97 if (!test_and_set_bit(PG_arch_1, &fpage->flags))
98 list_add(&fpage->lru, &deferred_pages);
99}
100
101/*
102 * No more special protections in this 2/4MB area - revert to a
103 * large page again.
104 */
105static void revert_page(unsigned long address, pgprot_t ref_prot)
106{
107 pgd_t *pgd;
108 pud_t *pud;
109 pmd_t *pmd;
110 pte_t large_pte;
111 unsigned long pfn;
112
113 pgd = pgd_offset_k(address);
114 BUG_ON(pgd_none(*pgd));
115 pud = pud_offset(pgd,address);
116 BUG_ON(pud_none(*pud));
117 pmd = pmd_offset(pud, address);
118 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
119 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
120 large_pte = pfn_pte(pfn, ref_prot);
121 large_pte = pte_mkhuge(large_pte);
122 set_pte((pte_t *)pmd, large_pte);
123}
124
125static int
126__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
127 pgprot_t ref_prot)
128{
129 pte_t *kpte;
130 struct page *kpte_page;
131 pgprot_t ref_prot2;
132
133 kpte = lookup_address(address);
134 if (!kpte) return 0;
135 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
136 BUG_ON(PageLRU(kpte_page));
137 BUG_ON(PageCompound(kpte_page));
138 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
139 if (!pte_huge(*kpte)) {
140 set_pte(kpte, pfn_pte(pfn, prot));
141 } else {
142 /*
143 * split_large_page will take the reference for this
144 * change_page_attr on the split page.
145 */
146 struct page *split;
147 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
148 split = split_large_page(address, prot, ref_prot2);
149 if (!split)
150 return -ENOMEM;
151 set_pte(kpte, mk_pte(split, ref_prot2));
152 kpte_page = split;
153 }
154 page_private(kpte_page)++;
155 } else if (!pte_huge(*kpte)) {
156 set_pte(kpte, pfn_pte(pfn, ref_prot));
157 BUG_ON(page_private(kpte_page) == 0);
158 page_private(kpte_page)--;
159 } else
160 BUG();
161
162 /* on x86-64 the direct mapping set at boot is not using 4k pages */
163 BUG_ON(PageReserved(kpte_page));
164
165 save_page(kpte_page);
166 if (page_private(kpte_page) == 0)
167 revert_page(address, ref_prot);
168 return 0;
169}
170
171/*
172 * Change the page attributes of a page in the linear mapping.
173 *
174 * This should be used when a page is mapped with a different caching policy
175 * than write-back somewhere - some CPUs do not like it when mappings with
176 * different caching policies exist. This changes the page attributes of the
177 * in kernel linear mapping too.
178 *
179 * The caller needs to ensure that there are no conflicting mappings elsewhere.
180 * This function only deals with the kernel linear map.
181 *
182 * Caller must call global_flush_tlb() after this.
183 */
184int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
185{
186 int err = 0, kernel_map = 0;
187 int i;
188
189 if (address >= __START_KERNEL_map
190 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
191 address = (unsigned long)__va(__pa(address));
192 kernel_map = 1;
193 }
194
195 down_write(&init_mm.mmap_sem);
196 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
197 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
198
199 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
200 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
201 if (err)
202 break;
203 }
204		/* Handle the kernel mapping too, which aliases part of the
205 * lowmem */
206 if (__pa(address) < KERNEL_TEXT_SIZE) {
207 unsigned long addr2;
208 pgprot_t prot2;
209 addr2 = __START_KERNEL_map + __pa(address);
210 /* Make sure the kernel mappings stay executable */
211 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
212 err = __change_page_attr(addr2, pfn, prot2,
213 PAGE_KERNEL_EXEC);
214 }
215 }
216 up_write(&init_mm.mmap_sem);
217 return err;
218}
219
220/* Don't call this for MMIO areas that may not have a mem_map entry */
221int change_page_attr(struct page *page, int numpages, pgprot_t prot)
222{
223 unsigned long addr = (unsigned long)page_address(page);
224 return change_page_attr_addr(addr, numpages, prot);
225}
226
227void global_flush_tlb(void)
228{
229 struct page *pg, *next;
230 struct list_head l;
231
232 down_read(&init_mm.mmap_sem);
233 list_replace_init(&deferred_pages, &l);
234 up_read(&init_mm.mmap_sem);
235
236 flush_map(&l);
237
238 list_for_each_entry_safe(pg, next, &l, lru) {
239 list_del(&pg->lru);
240 clear_bit(PG_arch_1, &pg->flags);
241 if (page_private(pg) != 0)
242 continue;
243 ClearPagePrivate(pg);
244 __free_page(pg);
245 }
246}
247
248EXPORT_SYMBOL(change_page_attr);
249EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 000000000000..01437c46baae
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,373 @@
1/*
2 * linux/arch/i386/mm/pgtable.c
3 */
4
5#include <linux/sched.h>
6#include <linux/kernel.h>
7#include <linux/errno.h>
8#include <linux/mm.h>
9#include <linux/swap.h>
10#include <linux/smp.h>
11#include <linux/highmem.h>
12#include <linux/slab.h>
13#include <linux/pagemap.h>
14#include <linux/spinlock.h>
15#include <linux/module.h>
16#include <linux/quicklist.h>
17
18#include <asm/system.h>
19#include <asm/pgtable.h>
20#include <asm/pgalloc.h>
21#include <asm/fixmap.h>
22#include <asm/e820.h>
23#include <asm/tlb.h>
24#include <asm/tlbflush.h>
25
26void show_mem(void)
27{
28 int total = 0, reserved = 0;
29 int shared = 0, cached = 0;
30 int highmem = 0;
31 struct page *page;
32 pg_data_t *pgdat;
33 unsigned long i;
34 unsigned long flags;
35
36 printk(KERN_INFO "Mem-info:\n");
37 show_free_areas();
38 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
39 for_each_online_pgdat(pgdat) {
40 pgdat_resize_lock(pgdat, &flags);
41 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
42 page = pgdat_page_nr(pgdat, i);
43 total++;
44 if (PageHighMem(page))
45 highmem++;
46 if (PageReserved(page))
47 reserved++;
48 else if (PageSwapCache(page))
49 cached++;
50 else if (page_count(page))
51 shared += page_count(page) - 1;
52 }
53 pgdat_resize_unlock(pgdat, &flags);
54 }
55 printk(KERN_INFO "%d pages of RAM\n", total);
56 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
57 printk(KERN_INFO "%d reserved pages\n", reserved);
58 printk(KERN_INFO "%d pages shared\n", shared);
59 printk(KERN_INFO "%d pages swap cached\n", cached);
60
61 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
62 printk(KERN_INFO "%lu pages writeback\n",
63 global_page_state(NR_WRITEBACK));
64 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
65 printk(KERN_INFO "%lu pages slab\n",
66 global_page_state(NR_SLAB_RECLAIMABLE) +
67 global_page_state(NR_SLAB_UNRECLAIMABLE));
68 printk(KERN_INFO "%lu pages pagetables\n",
69 global_page_state(NR_PAGETABLE));
70}
71
72/*
73 * Associate a virtual page frame with a given physical page frame
74 * and protection flags for that frame.
75 */
76static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
77{
78 pgd_t *pgd;
79 pud_t *pud;
80 pmd_t *pmd;
81 pte_t *pte;
82
83 pgd = swapper_pg_dir + pgd_index(vaddr);
84 if (pgd_none(*pgd)) {
85 BUG();
86 return;
87 }
88 pud = pud_offset(pgd, vaddr);
89 if (pud_none(*pud)) {
90 BUG();
91 return;
92 }
93 pmd = pmd_offset(pud, vaddr);
94 if (pmd_none(*pmd)) {
95 BUG();
96 return;
97 }
98 pte = pte_offset_kernel(pmd, vaddr);
99 if (pgprot_val(flags))
100 /* <pfn,flags> stored as-is, to permit clearing entries */
101 set_pte(pte, pfn_pte(pfn, flags));
102 else
103 pte_clear(&init_mm, vaddr, pte);
104
105 /*
106 * It's enough to flush this one mapping.
107 * (PGE mappings get flushed as well)
108 */
109 __flush_tlb_one(vaddr);
110}
111
112/*
113 * Associate a large virtual page frame with a given physical page frame
114 * and protection flags for that frame. pfn is for the base of the page,
115 * vaddr is what the page gets mapped to - both must be properly aligned.
116 * The pmd must already be instantiated. Assumes PAE mode.
117 */
118void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
119{
120 pgd_t *pgd;
121 pud_t *pud;
122 pmd_t *pmd;
123
124 if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
125 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
126 return; /* BUG(); */
127 }
128 if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
129 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
130 return; /* BUG(); */
131 }
132 pgd = swapper_pg_dir + pgd_index(vaddr);
133 if (pgd_none(*pgd)) {
134 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
135 return; /* BUG(); */
136 }
137 pud = pud_offset(pgd, vaddr);
138 pmd = pmd_offset(pud, vaddr);
139 set_pmd(pmd, pfn_pmd(pfn, flags));
140 /*
141 * It's enough to flush this one mapping.
142 * (PGE mappings get flushed as well)
143 */
144 __flush_tlb_one(vaddr);
145}
146
147static int fixmaps;
148unsigned long __FIXADDR_TOP = 0xfffff000;
149EXPORT_SYMBOL(__FIXADDR_TOP);
150
151void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
152{
153 unsigned long address = __fix_to_virt(idx);
154
155 if (idx >= __end_of_fixed_addresses) {
156 BUG();
157 return;
158 }
159 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
160 fixmaps++;
161}
162
163/**
164 * reserve_top_address - reserves a hole in the top of kernel address space
165 * @reserve: size of hole to reserve
166 *
167 * Can be used to relocate the fixmap area and poke a hole in the top
168 * of kernel address space to make room for a hypervisor.
169 */
170void reserve_top_address(unsigned long reserve)
171{
172 BUG_ON(fixmaps > 0);
173 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
174 (int)-reserve);
175 __FIXADDR_TOP = -reserve - PAGE_SIZE;
176 __VMALLOC_RESERVE += reserve;
177}
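/*
 * Illustrative call (made-up size): a paravirtualized guest's early setup
 * code could invoke reserve_top_address(64 * 1024 * 1024) to keep the top
 * 64 MB of virtual address space free for its hypervisor.
 */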
178
179pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
180{
181 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
182}
183
184struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
185{
186 struct page *pte;
187
188#ifdef CONFIG_HIGHPTE
189 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
190#else
191 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
192#endif
193 return pte;
194}
195
196void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
197{
198 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
199}
200
201/*
202 * List of all pgd's needed for non-PAE so it can invalidate entries
203 * in both cached and uncached pgd's; not needed for PAE since the
204 * kernel pmd is shared. If PAE were not to share the pmd a similar
205 * tactic would be needed. This is essentially codepath-based locking
206 * against pageattr.c; it is the unique case in which a valid change
207 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
208 * vmalloc faults work because attached pagetables are never freed.
209 * -- wli
210 */
211DEFINE_SPINLOCK(pgd_lock);
212struct page *pgd_list;
213
214static inline void pgd_list_add(pgd_t *pgd)
215{
216 struct page *page = virt_to_page(pgd);
217 page->index = (unsigned long)pgd_list;
218 if (pgd_list)
219 set_page_private(pgd_list, (unsigned long)&page->index);
220 pgd_list = page;
221 set_page_private(page, (unsigned long)&pgd_list);
222}
223
224static inline void pgd_list_del(pgd_t *pgd)
225{
226 struct page *next, **pprev, *page = virt_to_page(pgd);
227 next = (struct page *)page->index;
228 pprev = (struct page **)page_private(page);
229 *pprev = next;
230 if (next)
231 set_page_private(next, (unsigned long)pprev);
232}
233
234
235
236#if (PTRS_PER_PMD == 1)
237/* Non-PAE pgd constructor */
238static void pgd_ctor(void *pgd)
239{
240 unsigned long flags;
241
242 /* !PAE, no pagetable sharing */
243 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
244
245 spin_lock_irqsave(&pgd_lock, flags);
246
247 /* must happen under lock */
248 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
249 swapper_pg_dir + USER_PTRS_PER_PGD,
250 KERNEL_PGD_PTRS);
251 paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
252 __pa(swapper_pg_dir) >> PAGE_SHIFT,
253 USER_PTRS_PER_PGD,
254 KERNEL_PGD_PTRS);
255 pgd_list_add(pgd);
256 spin_unlock_irqrestore(&pgd_lock, flags);
257}
258#else /* PTRS_PER_PMD > 1 */
259/* PAE pgd constructor */
260static void pgd_ctor(void *pgd)
261{
262 /* PAE, kernel PMD may be shared */
263
264 if (SHARED_KERNEL_PMD) {
265 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
266 swapper_pg_dir + USER_PTRS_PER_PGD,
267 KERNEL_PGD_PTRS);
268 } else {
269 unsigned long flags;
270
271 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
272 spin_lock_irqsave(&pgd_lock, flags);
273 pgd_list_add(pgd);
274 spin_unlock_irqrestore(&pgd_lock, flags);
275 }
276}
277#endif /* PTRS_PER_PMD */
278
279static void pgd_dtor(void *pgd)
280{
281 unsigned long flags; /* can be called from interrupt context */
282
283 if (SHARED_KERNEL_PMD)
284 return;
285
286 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
287 spin_lock_irqsave(&pgd_lock, flags);
288 pgd_list_del(pgd);
289 spin_unlock_irqrestore(&pgd_lock, flags);
290}
291
292#define UNSHARED_PTRS_PER_PGD \
293 (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
294
295/* If we allocate a pmd for part of the kernel address space, then
296 make sure it's initialized with the appropriate kernel mappings.
297 Otherwise use a cached zeroed pmd. */
298static pmd_t *pmd_cache_alloc(int idx)
299{
300 pmd_t *pmd;
301
302 if (idx >= USER_PTRS_PER_PGD) {
303 pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
304
305 if (pmd)
306 memcpy(pmd,
307 (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
308 sizeof(pmd_t) * PTRS_PER_PMD);
309 } else
310 pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
311
312 return pmd;
313}
314
315static void pmd_cache_free(pmd_t *pmd, int idx)
316{
317 if (idx >= USER_PTRS_PER_PGD)
318 free_page((unsigned long)pmd);
319 else
320 kmem_cache_free(pmd_cache, pmd);
321}
322
323pgd_t *pgd_alloc(struct mm_struct *mm)
324{
325 int i;
326 pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
327
328 if (PTRS_PER_PMD == 1 || !pgd)
329 return pgd;
330
331 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
332 pmd_t *pmd = pmd_cache_alloc(i);
333
334 if (!pmd)
335 goto out_oom;
336
337 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
338 set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
339 }
340 return pgd;
341
342out_oom:
343 for (i--; i >= 0; i--) {
344 pgd_t pgdent = pgd[i];
345 void* pmd = (void *)__va(pgd_val(pgdent)-1);
346 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
347 pmd_cache_free(pmd, i);
348 }
349 quicklist_free(0, pgd_dtor, pgd);
350 return NULL;
351}
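/*
 * Note on the "+1"/"-1" arithmetic above: __pgd(1 + __pa(pmd)) stores the
 * pmd's physical address with bit 0 (_PAGE_PRESENT) set, and the
 * __va(pgd_val(pgdent) - 1) in the out_oom path (and in pgd_free() below)
 * strips that bit again to recover the pmd's virtual address.
 */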
352
353void pgd_free(pgd_t *pgd)
354{
355 int i;
356
357 /* in the PAE case user pgd entries are overwritten before usage */
358 if (PTRS_PER_PMD > 1)
359 for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
360 pgd_t pgdent = pgd[i];
361 void* pmd = (void *)__va(pgd_val(pgdent)-1);
362 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
363 pmd_cache_free(pmd, i);
364 }
365 /* in the non-PAE case, free_pgtables() clears user pgd entries */
366 quicklist_free(0, pgd_dtor, pgd);
367}
368
369void check_pgt_cache(void)
370{
371 quicklist_trim(0, pgd_dtor, 25, 16);
372}
373
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
new file mode 100644
index 000000000000..acdf03e19146
--- /dev/null
+++ b/arch/x86/mm/srat_64.c
@@ -0,0 +1,566 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23
24int acpi_numa __initdata;
25
26static struct acpi_table_slit *acpi_slit;
27
28static nodemask_t nodes_parsed __initdata;
29static struct bootnode nodes[MAX_NUMNODES] __initdata;
30static struct bootnode nodes_add[MAX_NUMNODES];
31static int found_add_area __initdata;
32int hotadd_percent __initdata = 0;
33
34/* Too small nodes confuse the VM badly. Usually they result
35 from BIOS bugs. */
36#define NODE_MIN_SIZE (4*1024*1024)
37
38static __init int setup_node(int pxm)
39{
40 return acpi_map_pxm_to_node(pxm);
41}
42
43static __init int conflicting_nodes(unsigned long start, unsigned long end)
44{
45 int i;
46 for_each_node_mask(i, nodes_parsed) {
47 struct bootnode *nd = &nodes[i];
48 if (nd->start == nd->end)
49 continue;
50 if (nd->end > start && nd->start < end)
51 return i;
52 if (nd->end == end && nd->start == start)
53 return i;
54 }
55 return -1;
56}
57
58static __init void cutoff_node(int i, unsigned long start, unsigned long end)
59{
60 struct bootnode *nd = &nodes[i];
61
62 if (found_add_area)
63 return;
64
65 if (nd->start < start) {
66 nd->start = start;
67 if (nd->end < nd->start)
68 nd->start = nd->end;
69 }
70 if (nd->end > end) {
71 nd->end = end;
72 if (nd->start > nd->end)
73 nd->start = nd->end;
74 }
75}
76
77static __init void bad_srat(void)
78{
79 int i;
80 printk(KERN_ERR "SRAT: SRAT not used.\n");
81 acpi_numa = -1;
82 found_add_area = 0;
83 for (i = 0; i < MAX_LOCAL_APIC; i++)
84 apicid_to_node[i] = NUMA_NO_NODE;
85 for (i = 0; i < MAX_NUMNODES; i++)
86 nodes_add[i].start = nodes[i].end = 0;
87 remove_all_active_ranges();
88}
89
90static __init inline int srat_disabled(void)
91{
92 return numa_off || acpi_numa < 0;
93}
94
95/*
96 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
97 * up the NUMA heuristics, which want the local node to have a smaller
98 * distance than the others.
99 * Do some quick checks here and only use the SLIT if it passes.
100 */
101static __init int slit_valid(struct acpi_table_slit *slit)
102{
103 int i, j;
104 int d = slit->locality_count;
105 for (i = 0; i < d; i++) {
106 for (j = 0; j < d; j++) {
107 u8 val = slit->entry[d*i + j];
108 if (i == j) {
109 if (val != LOCAL_DISTANCE)
110 return 0;
111 } else if (val <= LOCAL_DISTANCE)
112 return 0;
113 }
114 }
115 return 1;
116}
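/*
 * Example of what this check accepts and rejects (the 2x2 matrices are
 * illustrative, not from a real table), with locality_count = 2:
 *
 *	{ 10, 20,	accepted: diagonal entries equal LOCAL_DISTANCE (10)
 *	  20, 10 }	and off-diagonal entries are larger
 *
 *	{ 10, 10,	rejected: a BIOS that fills in 10 everywhere claims
 *	  10, 10 }	remote nodes are as close as the local one
 */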
117
118/* Callback for SLIT parsing */
119void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
120{
121 if (!slit_valid(slit)) {
122 printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
123 return;
124 }
125 acpi_slit = slit;
126}
127
128/* Callback for Proximity Domain -> LAPIC mapping */
129void __init
130acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131{
132 int pxm, node;
133 if (srat_disabled())
134 return;
135 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
136 bad_srat();
137 return;
138 }
139 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
140 return;
141 pxm = pa->proximity_domain_lo;
142 node = setup_node(pxm);
143 if (node < 0) {
144 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
145 bad_srat();
146 return;
147 }
148 apicid_to_node[pa->apic_id] = node;
149 acpi_numa = 1;
150 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
151 pxm, pa->apic_id, node);
152}
153
154#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155/*
156 * Protect against too large hotadd areas that would fill up memory.
157 */
158static int hotadd_enough_memory(struct bootnode *nd)
159{
160 static unsigned long allocated;
161 static unsigned long last_area_end;
162 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
163 long mem = pages * sizeof(struct page);
164 unsigned long addr;
165 unsigned long allowed;
166 unsigned long oldpages = pages;
167
168 if (mem < 0)
169 return 0;
170 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 allowed = (allowed / 100) * hotadd_percent;
172 if (allocated + mem > allowed) {
173 unsigned long range;
174 /* Give them at least part of their hotadd memory, up to hotadd_percent.
175 It would be better to spread the limit out
176 over multiple hotplug areas, but that is too complicated
177 right now */
178 if (allocated >= allowed)
179 return 0;
180 range = allowed - allocated;
181 pages = (range / PAGE_SIZE);
182 mem = pages * sizeof(struct page);
183 nd->end = nd->start + range;
184 }
185 /* Not completely foolproof, but a good sanity check */
186 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
187 if (addr == -1UL)
188 return 0;
189 if (pages != oldpages)
190 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
191 pages << PAGE_SHIFT);
192 last_area_end = addr + mem;
193 allocated += mem;
194 return 1;
195}
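/*
 * Rough numbers for the check above (4 KB pages and a 56-byte struct page
 * are assumed purely for illustration): with hotadd_percent=10 on a 4 GB
 * machine, "allowed" is roughly 400 MB.  A 1 GB hot-add area costs
 * 262144 pages * 56 bytes ~= 14 MB of mem_map, well under that budget, so
 * it is accepted in full; once the running "allocated" total would exceed
 * "allowed", the area is clipped to whatever budget remains.
 */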
196
197static int update_end_of_memory(unsigned long end)
198{
199 found_add_area = 1;
200 if ((end >> PAGE_SHIFT) > end_pfn)
201 end_pfn = end >> PAGE_SHIFT;
202 return 1;
203}
204
205static inline int save_add_info(void)
206{
207 return hotadd_percent > 0;
208}
209#else
210int update_end_of_memory(unsigned long end) {return -1;}
211static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
212#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
213static inline int save_add_info(void) {return 1;}
214#else
215static inline int save_add_info(void) {return 0;}
216#endif
217#endif
218/*
219 * Update nodes_add and decide whether to include the add area in the zone.
220 * Both SPARSE and RESERVE need the nodes_add information.
221 * This code supports one contiguous hot add area per node.
222 */
223static int reserve_hotadd(int node, unsigned long start, unsigned long end)
224{
225 unsigned long s_pfn = start >> PAGE_SHIFT;
226 unsigned long e_pfn = end >> PAGE_SHIFT;
227 int ret = 0, changed = 0;
228 struct bootnode *nd = &nodes_add[node];
229
230 /* I had some trouble with strange memory hotadd regions breaking
231 the boot. Be very strict here and reject anything unexpected.
232 If you want working memory hotadd, write correct SRATs.
233
234 The node size check is a basic sanity check to guard against
235 mistakes */
236 if ((signed long)(end - start) < NODE_MIN_SIZE) {
237 printk(KERN_ERR "SRAT: Hotplug area too small\n");
238 return -1;
239 }
240
241 /* This check might be a bit too strict, but I'm keeping it for now. */
242 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
243 printk(KERN_ERR
244 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
245 s_pfn, e_pfn);
246 return -1;
247 }
248
249 if (!hotadd_enough_memory(&nodes_add[node])) {
250 printk(KERN_ERR "SRAT: Hotplug area too large\n");
251 return -1;
252 }
253
254 /* Looks good */
255
256 if (nd->start == nd->end) {
257 nd->start = start;
258 nd->end = end;
259 changed = 1;
260 } else {
261 if (nd->start == end) {
262 nd->start = start;
263 changed = 1;
264 }
265 if (nd->end == start) {
266 nd->end = end;
267 changed = 1;
268 }
269 if (!changed)
270 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
271 }
272
273 ret = update_end_of_memory(nd->end);
274
275 if (changed)
276 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
277 return ret;
278}
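/*
 * Coalescing example (addresses made up): if nodes_add[1] already records
 * [4G, 6G), a new hot-pluggable SRAT range [6G, 8G) extends it to [4G, 8G)
 * through the "nd->end == start" case above; a non-adjacent range such as
 * [10G, 12G) triggers the "Hotplug zone not continuous" message and leaves
 * the recorded range unchanged.
 */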
279
280/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
281void __init
282acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
283{
284 struct bootnode *nd, oldnode;
285 unsigned long start, end;
286 int node, pxm;
287 int i;
288
289 if (srat_disabled())
290 return;
291 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
292 bad_srat();
293 return;
294 }
295 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
296 return;
297
298 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
299 return;
300 start = ma->base_address;
301 end = start + ma->length;
302 pxm = ma->proximity_domain;
303 node = setup_node(pxm);
304 if (node < 0) {
305 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
306 bad_srat();
307 return;
308 }
309 i = conflicting_nodes(start, end);
310 if (i == node) {
311 printk(KERN_WARNING
312 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
313 pxm, start, end, nodes[i].start, nodes[i].end);
314 } else if (i >= 0) {
315 printk(KERN_ERR
316 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
317 pxm, start, end, node_to_pxm(i),
318 nodes[i].start, nodes[i].end);
319 bad_srat();
320 return;
321 }
322 nd = &nodes[node];
323 oldnode = *nd;
324 if (!node_test_and_set(node, nodes_parsed)) {
325 nd->start = start;
326 nd->end = end;
327 } else {
328 if (start < nd->start)
329 nd->start = start;
330 if (nd->end < end)
331 nd->end = end;
332 }
333
334 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
335 nd->start, nd->end);
336 e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
337 nd->end >> PAGE_SHIFT);
338 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
339 nd->end >> PAGE_SHIFT);
340
341 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
342 (reserve_hotadd(node, start, end) < 0)) {
343 /* Ignore hotadd region. Undo damage */
344 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
345 *nd = oldnode;
346 if ((nd->start | nd->end) == 0)
347 node_clear(node, nodes_parsed);
348 }
349}
350
351/* Sanity check to catch more bad SRATs (they are amazingly common).
352 Make sure the PXMs cover all memory. */
353static int __init nodes_cover_memory(const struct bootnode *nodes)
354{
355 int i;
356 unsigned long pxmram, e820ram;
357
358 pxmram = 0;
359 for_each_node_mask(i, nodes_parsed) {
360 unsigned long s = nodes[i].start >> PAGE_SHIFT;
361 unsigned long e = nodes[i].end >> PAGE_SHIFT;
362 pxmram += e - s;
363 pxmram -= absent_pages_in_range(s, e);
364 if ((long)pxmram < 0)
365 pxmram = 0;
366 }
367
368 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
369 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
370 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
371 printk(KERN_ERR
372 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
373 (pxmram << PAGE_SHIFT) >> 20,
374 (e820ram << PAGE_SHIFT) >> 20);
375 return 0;
376 }
377 return 1;
378}
379
380static void unparse_node(int node)
381{
382 int i;
383 node_clear(node, nodes_parsed);
384 for (i = 0; i < MAX_LOCAL_APIC; i++) {
385 if (apicid_to_node[i] == node)
386 apicid_to_node[i] = NUMA_NO_NODE;
387 }
388}
389
390void __init acpi_numa_arch_fixup(void) {}
391
392/* Use the information discovered above to actually set up the nodes. */
393int __init acpi_scan_nodes(unsigned long start, unsigned long end)
394{
395 int i;
396
397 if (acpi_numa <= 0)
398 return -1;
399
400 /* First clean up the node list */
401 for (i = 0; i < MAX_NUMNODES; i++) {
402 cutoff_node(i, start, end);
403 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
404 unparse_node(i);
405 node_set_offline(i);
406 }
407 }
408
409 if (!nodes_cover_memory(nodes)) {
410 bad_srat();
411 return -1;
412 }
413
414 memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
415 if (memnode_shift < 0) {
416 printk(KERN_ERR
417 "SRAT: No NUMA node hash function found. Contact maintainer\n");
418 bad_srat();
419 return -1;
420 }
421
422 node_possible_map = nodes_parsed;
423
424 /* Finally register nodes */
425 for_each_node_mask(i, node_possible_map)
426 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
427 /* Try again in case setup_node_bootmem missed one due
428 to missing bootmem */
429 for_each_node_mask(i, node_possible_map)
430 if (!node_online(i))
431 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
432
433 for (i = 0; i < NR_CPUS; i++) {
434 if (cpu_to_node[i] == NUMA_NO_NODE)
435 continue;
436 if (!node_isset(cpu_to_node[i], node_possible_map))
437 numa_set_node(i, NUMA_NO_NODE);
438 }
439 numa_init_array();
440 return 0;
441}
442
443#ifdef CONFIG_NUMA_EMU
444static int __init find_node_by_addr(unsigned long addr)
445{
446 int ret = NUMA_NO_NODE;
447 int i;
448
449 for_each_node_mask(i, nodes_parsed) {
450 /*
451 * Find the real node that this emulated node appears on. For
452 * the sake of simplicity, we only use a real node's starting
453 * address to determine which emulated node it appears on.
454 */
455 if (addr >= nodes[i].start && addr < nodes[i].end) {
456 ret = i;
457 break;
458 }
459 }
460 return ret;
461}
462
463/*
464 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
465 * mappings that respect the real ACPI topology but reflect our emulated
466 * environment. For each emulated node, we find which real node it appears on
467 * and create PXM to NID mappings for those fake nodes which mirror that
468 * locality. SLIT will now represent the correct distances between emulated
469 * nodes as a result of the real topology.
470 */
471void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
472{
473 int i, j;
474 int fake_node_to_pxm_map[MAX_NUMNODES] = {
475 [0 ... MAX_NUMNODES-1] = PXM_INVAL
476 };
477 unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
478 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
479 };
480
481 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
482 "topology.\n");
483 for (i = 0; i < num_nodes; i++) {
484 int nid, pxm;
485
486 nid = find_node_by_addr(fake_nodes[i].start);
487 if (nid == NUMA_NO_NODE)
488 continue;
489 pxm = node_to_pxm(nid);
490 if (pxm == PXM_INVAL)
491 continue;
492 fake_node_to_pxm_map[i] = pxm;
493 /*
494 * For each apicid_to_node mapping that exists for this real
495 * node, it must now point to the fake node ID.
496 */
497 for (j = 0; j < MAX_LOCAL_APIC; j++)
498 if (apicid_to_node[j] == nid)
499 fake_apicid_to_node[j] = i;
500 }
501 for (i = 0; i < num_nodes; i++)
502 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
503 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
504
505 nodes_clear(nodes_parsed);
506 for (i = 0; i < num_nodes; i++)
507 if (fake_nodes[i].start != fake_nodes[i].end)
508 node_set(i, nodes_parsed);
509 WARN_ON(!nodes_cover_memory(fake_nodes));
510}
511
512static int null_slit_node_compare(int a, int b)
513{
514 return node_to_pxm(a) == node_to_pxm(b);
515}
516#else
517static int null_slit_node_compare(int a, int b)
518{
519 return a == b;
520}
521#endif /* CONFIG_NUMA_EMU */
522
523void __init srat_reserve_add_area(int nodeid)
524{
525 if (found_add_area && nodes_add[nodeid].end) {
526 u64 total_mb;
527
528 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
529 "for node %d at %Lx-%Lx\n",
530 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
531 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
532 >> PAGE_SHIFT;
533 total_mb *= sizeof(struct page);
534 total_mb >>= 20;
535 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
536 "pre-allocated memory.\n", (unsigned long long)total_mb);
537 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
538 nodes_add[nodeid].end - nodes_add[nodeid].start);
539 }
540}
541
542int __node_distance(int a, int b)
543{
544 int index;
545
546 if (!acpi_slit)
547 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
548 REMOTE_DISTANCE;
549 index = acpi_slit->locality_count * node_to_pxm(a);
550 return acpi_slit->entry[index + node_to_pxm(b)];
551}
552
553EXPORT_SYMBOL(__node_distance);
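/*
 * Lookup example (values made up): with a SLIT whose locality_count is 4
 * and with nodes 1 and 2 mapped to PXMs 1 and 2, __node_distance(1, 2)
 * returns acpi_slit->entry[4 * 1 + 2].  Without a SLIT, any two distinct
 * nodes (distinct PXMs under NUMA emulation) report REMOTE_DISTANCE.
 */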
554
555int memory_add_physaddr_to_nid(u64 start)
556{
557 int i, ret = 0;
558
559 for_each_node(i)
560 if (nodes_add[i].start <= start && nodes_add[i].end > start)
561 ret = i;
562
563 return ret;
564}
565EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
566
diff --git a/arch/x86/oprofile/Kconfig b/arch/x86/oprofile/Kconfig
new file mode 100644
index 000000000000..d8a84088471a
--- /dev/null
+++ b/arch/x86/oprofile/Kconfig
@@ -0,0 +1,17 @@
1config PROFILING
2 bool "Profiling support (EXPERIMENTAL)"
3 help
4 Say Y here to enable the extended profiling support mechanisms used
5 by profilers such as OProfile.
6
7
8config OPROFILE
9 tristate "OProfile system profiling (EXPERIMENTAL)"
10 depends on PROFILING
11 help
12 OProfile is a profiling system capable of profiling the
13 whole system, including the kernel, kernel modules, libraries,
14 and applications.
15
16 If unsure, say N.
17
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
new file mode 100644
index 000000000000..30f3eb366667
--- /dev/null
+++ b/arch/x86/oprofile/Makefile
@@ -0,0 +1,12 @@
1obj-$(CONFIG_OPROFILE) += oprofile.o
2
3DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
4 oprof.o cpu_buffer.o buffer_sync.o \
5 event_buffer.o oprofile_files.o \
6 oprofilefs.o oprofile_stats.o \
7 timer_int.o )
8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \
11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
new file mode 100644
index 000000000000..c049ce414f01
--- /dev/null
+++ b/arch/x86/oprofile/backtrace.c
@@ -0,0 +1,127 @@
1/**
2 * @file backtrace.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon
8 * @author David Smith
9 */
10
11#include <linux/oprofile.h>
12#include <linux/sched.h>
13#include <linux/mm.h>
14#include <asm/ptrace.h>
15#include <asm/uaccess.h>
16
17struct frame_head {
18 struct frame_head * ebp;
19 unsigned long ret;
20} __attribute__((packed));
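/*
 * This mirrors the standard i386 frame layout set up by the usual
 * "pushl %ebp; movl %esp, %ebp" prologue (assuming frame pointers are
 * compiled in): at (%ebp) sits the saved caller %ebp (-> ebp above) and at
 * 4(%ebp) the return address (-> ret above), so following ->ebp hops one
 * call frame up the stack each time.
 */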
21
22static struct frame_head *
23dump_kernel_backtrace(struct frame_head * head)
24{
25 oprofile_add_trace(head->ret);
26
27 /* frame pointers should strictly progress back up the stack
28 * (towards higher addresses) */
29 if (head >= head->ebp)
30 return NULL;
31
32 return head->ebp;
33}
34
35static struct frame_head *
36dump_user_backtrace(struct frame_head * head)
37{
38 struct frame_head bufhead[2];
39
40 /* Also check accessibility of one struct frame_head beyond */
41 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
42 return NULL;
43 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
44 return NULL;
45
46 oprofile_add_trace(bufhead[0].ret);
47
48 /* frame pointers should strictly progress back up the stack
49 * (towards higher addresses) */
50 if (head >= bufhead[0].ebp)
51 return NULL;
52
53 return bufhead[0].ebp;
54}
55
56/*
57 * | | /\ Higher addresses
58 * | |
59 * --------------- stack base (address of current_thread_info)
60 * | thread info |
61 * . .
62 * | stack |
63 * --------------- saved regs->ebp value if valid (frame_head address)
64 * . .
65 * --------------- saved regs->rsp value if x86_64
66 * | |
67 * --------------- struct pt_regs * stored on stack if 32-bit
68 * | |
69 * . .
70 * | |
71 * --------------- %esp
72 * | |
73 * | | \/ Lower addresses
74 *
75 * Thus, regs (or regs->rsp for x86_64) <-> stack base restricts the
76 * valid(ish) ebp values. Note: (1) for x86_64, NMI and several other
77 * exceptions use special stacks, maintained by the interrupt stack table
78 * (IST). These stacks are set up in trap_init() in
79 * arch/x86_64/kernel/traps.c. Thus, for x86_64, regs now does not point
80 * to the kernel stack; instead, it points to some location on the NMI
81 * stack. On the other hand, regs->rsp is the stack pointer saved when the
82 * NMI occurred. (2) For 32-bit, regs->esp is not valid because the
83 * processor does not save %esp on the kernel stack when interrupts occur
84 * in kernel mode.
85 */
86#ifdef CONFIG_FRAME_POINTER
87static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs)
88{
89 unsigned long headaddr = (unsigned long)head;
90#ifdef CONFIG_X86_64
91 unsigned long stack = (unsigned long)regs->rsp;
92#else
93 unsigned long stack = (unsigned long)regs;
94#endif
95 unsigned long stack_base = (stack & ~(THREAD_SIZE - 1)) + THREAD_SIZE;
96
97 return headaddr > stack && headaddr < stack_base;
98}
99#else
100/* without fp, it's just junk */
101static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs)
102{
103 return 0;
104}
105#endif
106
107
108void
109x86_backtrace(struct pt_regs * const regs, unsigned int depth)
110{
111 struct frame_head *head;
112
113#ifdef CONFIG_X86_64
114 head = (struct frame_head *)regs->rbp;
115#else
116 head = (struct frame_head *)regs->ebp;
117#endif
118
119 if (!user_mode_vm(regs)) {
120 while (depth-- && valid_kernel_stack(head, regs))
121 head = dump_kernel_backtrace(head);
122 return;
123 }
124
125 while (depth-- && head)
126 head = dump_user_backtrace(head);
127}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
new file mode 100644
index 000000000000..5341d481d92f
--- /dev/null
+++ b/arch/x86/oprofile/init.c
@@ -0,0 +1,48 @@
1/**
2 * @file init.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon <levon@movementarian.org>
8 */
9
10#include <linux/oprofile.h>
11#include <linux/init.h>
12#include <linux/errno.h>
13
14/* With the NMI mode driver we support CPUs that have performance
15 * counters, like the Pentium Pro.
16 */
17
18extern int op_nmi_init(struct oprofile_operations * ops);
19extern int op_nmi_timer_init(struct oprofile_operations * ops);
20extern void op_nmi_exit(void);
21extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
22
23
24int __init oprofile_arch_init(struct oprofile_operations * ops)
25{
26 int ret;
27
28 ret = -ENODEV;
29
30#ifdef CONFIG_X86_LOCAL_APIC
31 ret = op_nmi_init(ops);
32#endif
33#ifdef CONFIG_X86_IO_APIC
34 if (ret < 0)
35 ret = op_nmi_timer_init(ops);
36#endif
37 ops->backtrace = x86_backtrace;
38
39 return ret;
40}
41
42
43void oprofile_arch_exit(void)
44{
45#ifdef CONFIG_X86_LOCAL_APIC
46 op_nmi_exit();
47#endif
48}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
new file mode 100644
index 000000000000..11b7a51566a8
--- /dev/null
+++ b/arch/x86/oprofile/nmi_int.c
@@ -0,0 +1,477 @@
1/**
2 * @file nmi_int.c
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon <levon@movementarian.org>
8 */
9
10#include <linux/init.h>
11#include <linux/notifier.h>
12#include <linux/smp.h>
13#include <linux/oprofile.h>
14#include <linux/sysdev.h>
15#include <linux/slab.h>
16#include <linux/moduleparam.h>
17#include <linux/kdebug.h>
18#include <asm/nmi.h>
19#include <asm/msr.h>
20#include <asm/apic.h>
21
22#include "op_counter.h"
23#include "op_x86_model.h"
24
25static struct op_x86_model_spec const * model;
26static struct op_msrs cpu_msrs[NR_CPUS];
27static unsigned long saved_lvtpc[NR_CPUS];
28
29static int nmi_start(void);
30static void nmi_stop(void);
31
32/* 0 == registered but off, 1 == registered and on */
33static int nmi_enabled = 0;
34
35#ifdef CONFIG_PM
36
37static int nmi_suspend(struct sys_device *dev, pm_message_t state)
38{
39 if (nmi_enabled == 1)
40 nmi_stop();
41 return 0;
42}
43
44
45static int nmi_resume(struct sys_device *dev)
46{
47 if (nmi_enabled == 1)
48 nmi_start();
49 return 0;
50}
51
52
53static struct sysdev_class oprofile_sysclass = {
54 set_kset_name("oprofile"),
55 .resume = nmi_resume,
56 .suspend = nmi_suspend,
57};
58
59
60static struct sys_device device_oprofile = {
61 .id = 0,
62 .cls = &oprofile_sysclass,
63};
64
65
66static int __init init_sysfs(void)
67{
68 int error;
69 if (!(error = sysdev_class_register(&oprofile_sysclass)))
70 error = sysdev_register(&device_oprofile);
71 return error;
72}
73
74
75static void exit_sysfs(void)
76{
77 sysdev_unregister(&device_oprofile);
78 sysdev_class_unregister(&oprofile_sysclass);
79}
80
81#else
82#define init_sysfs() do { } while (0)
83#define exit_sysfs() do { } while (0)
84#endif /* CONFIG_PM */
85
86static int profile_exceptions_notify(struct notifier_block *self,
87 unsigned long val, void *data)
88{
89 struct die_args *args = (struct die_args *)data;
90 int ret = NOTIFY_DONE;
91 int cpu = smp_processor_id();
92
93 switch(val) {
94 case DIE_NMI:
95 if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
96 ret = NOTIFY_STOP;
97 break;
98 default:
99 break;
100 }
101 return ret;
102}
103
104static void nmi_cpu_save_registers(struct op_msrs * msrs)
105{
106 unsigned int const nr_ctrs = model->num_counters;
107 unsigned int const nr_ctrls = model->num_controls;
108 struct op_msr * counters = msrs->counters;
109 struct op_msr * controls = msrs->controls;
110 unsigned int i;
111
112 for (i = 0; i < nr_ctrs; ++i) {
113 if (counters[i].addr){
114 rdmsr(counters[i].addr,
115 counters[i].saved.low,
116 counters[i].saved.high);
117 }
118 }
119
120 for (i = 0; i < nr_ctrls; ++i) {
121 if (controls[i].addr){
122 rdmsr(controls[i].addr,
123 controls[i].saved.low,
124 controls[i].saved.high);
125 }
126 }
127}
128
129
130static void nmi_save_registers(void * dummy)
131{
132 int cpu = smp_processor_id();
133 struct op_msrs * msrs = &cpu_msrs[cpu];
134 nmi_cpu_save_registers(msrs);
135}
136
137
138static void free_msrs(void)
139{
140 int i;
141 for_each_possible_cpu(i) {
142 kfree(cpu_msrs[i].counters);
143 cpu_msrs[i].counters = NULL;
144 kfree(cpu_msrs[i].controls);
145 cpu_msrs[i].controls = NULL;
146 }
147}
148
149
150static int allocate_msrs(void)
151{
152 int success = 1;
153 size_t controls_size = sizeof(struct op_msr) * model->num_controls;
154 size_t counters_size = sizeof(struct op_msr) * model->num_counters;
155
156 int i;
157 for_each_possible_cpu(i) {
158 cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL);
159 if (!cpu_msrs[i].counters) {
160 success = 0;
161 break;
162 }
163 cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL);
164 if (!cpu_msrs[i].controls) {
165 success = 0;
166 break;
167 }
168 }
169
170 if (!success)
171 free_msrs();
172
173 return success;
174}
175
176
177static void nmi_cpu_setup(void * dummy)
178{
179 int cpu = smp_processor_id();
180 struct op_msrs * msrs = &cpu_msrs[cpu];
181 spin_lock(&oprofilefs_lock);
182 model->setup_ctrs(msrs);
183 spin_unlock(&oprofilefs_lock);
184 saved_lvtpc[cpu] = apic_read(APIC_LVTPC);
185 apic_write(APIC_LVTPC, APIC_DM_NMI);
186}
187
188static struct notifier_block profile_exceptions_nb = {
189 .notifier_call = profile_exceptions_notify,
190 .next = NULL,
191 .priority = 0
192};
193
194static int nmi_setup(void)
195{
196 int err=0;
197 int cpu;
198
199 if (!allocate_msrs())
200 return -ENOMEM;
201
202 if ((err = register_die_notifier(&profile_exceptions_nb))){
203 free_msrs();
204 return err;
205 }
206
207 /* We need to serialize save and setup for HT because the subsets
208 * of MSRs are distinct for the save and setup operations
209 */
210
211 /* Assume saved/restored counters are the same on all CPUs */
212 model->fill_in_addresses(&cpu_msrs[0]);
213 for_each_possible_cpu (cpu) {
214 if (cpu != 0) {
215 memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
216 sizeof(struct op_msr) * model->num_counters);
217
218 memcpy(cpu_msrs[cpu].controls, cpu_msrs[0].controls,
219 sizeof(struct op_msr) * model->num_controls);
220 }
221
222 }
223 on_each_cpu(nmi_save_registers, NULL, 0, 1);
224 on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
225 nmi_enabled = 1;
226 return 0;
227}
228
229
230static void nmi_restore_registers(struct op_msrs * msrs)
231{
232 unsigned int const nr_ctrs = model->num_counters;
233 unsigned int const nr_ctrls = model->num_controls;
234 struct op_msr * counters = msrs->counters;
235 struct op_msr * controls = msrs->controls;
236 unsigned int i;
237
238 for (i = 0; i < nr_ctrls; ++i) {
239 if (controls[i].addr){
240 wrmsr(controls[i].addr,
241 controls[i].saved.low,
242 controls[i].saved.high);
243 }
244 }
245
246 for (i = 0; i < nr_ctrs; ++i) {
247 if (counters[i].addr){
248 wrmsr(counters[i].addr,
249 counters[i].saved.low,
250 counters[i].saved.high);
251 }
252 }
253}
254
255
256static void nmi_cpu_shutdown(void * dummy)
257{
258 unsigned int v;
259 int cpu = smp_processor_id();
260 struct op_msrs * msrs = &cpu_msrs[cpu];
261
262 /* Restoring APIC_LVTPC can trigger an APIC error because the delivery
263 * mode and vector number combination can be illegal. That's by design: at
264 * power-on the APIC LVTs contain a zero vector number, which is legal only
265 * for NMI delivery mode. So inhibit APIC errors before restoring LVTPC.
266 */
267 v = apic_read(APIC_LVTERR);
268 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
269 apic_write(APIC_LVTPC, saved_lvtpc[cpu]);
270 apic_write(APIC_LVTERR, v);
271 nmi_restore_registers(msrs);
272 model->shutdown(msrs);
273}
274
275
276static void nmi_shutdown(void)
277{
278 nmi_enabled = 0;
279 on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
280 unregister_die_notifier(&profile_exceptions_nb);
281 free_msrs();
282}
283
284
285static void nmi_cpu_start(void * dummy)
286{
287 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
288 model->start(msrs);
289}
290
291
292static int nmi_start(void)
293{
294 on_each_cpu(nmi_cpu_start, NULL, 0, 1);
295 return 0;
296}
297
298
299static void nmi_cpu_stop(void * dummy)
300{
301 struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
302 model->stop(msrs);
303}
304
305
306static void nmi_stop(void)
307{
308 on_each_cpu(nmi_cpu_stop, NULL, 0, 1);
309}
310
311
312struct op_counter_config counter_config[OP_MAX_COUNTER];
313
314static int nmi_create_files(struct super_block * sb, struct dentry * root)
315{
316 unsigned int i;
317
318 for (i = 0; i < model->num_counters; ++i) {
319 struct dentry * dir;
320 char buf[4];
321
322 /* quick little hack to _not_ expose a counter if it is not
323 * available for use. This should protect userspace apps.
324 * NOTE: assumes 1:1 mapping here (that counters are organized
325 * sequentially in their struct assignment).
326 */
327 if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i)))
328 continue;
329
330 snprintf(buf, sizeof(buf), "%d", i);
331 dir = oprofilefs_mkdir(sb, root, buf);
332 oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
333 oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
334 oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
335 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
336 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
337 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
338 }
339
340 return 0;
341}
342
343static int p4force;
344module_param(p4force, int, 0);
345
346static int __init p4_init(char ** cpu_type)
347{
348 __u8 cpu_model = boot_cpu_data.x86_model;
349
350 if (!p4force && (cpu_model > 6 || cpu_model == 5))
351 return 0;
352
353#ifndef CONFIG_SMP
354 *cpu_type = "i386/p4";
355 model = &op_p4_spec;
356 return 1;
357#else
358 switch (smp_num_siblings) {
359 case 1:
360 *cpu_type = "i386/p4";
361 model = &op_p4_spec;
362 return 1;
363
364 case 2:
365 *cpu_type = "i386/p4-ht";
366 model = &op_p4_ht2_spec;
367 return 1;
368 }
369#endif
370
371 printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
372 printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
373 return 0;
374}
375
376
377static int __init ppro_init(char ** cpu_type)
378{
379 __u8 cpu_model = boot_cpu_data.x86_model;
380
381 if (cpu_model == 14)
382 *cpu_type = "i386/core";
383 else if (cpu_model == 15)
384 *cpu_type = "i386/core_2";
385 else if (cpu_model > 0xd)
386 return 0;
387 else if (cpu_model == 9) {
388 *cpu_type = "i386/p6_mobile";
389 } else if (cpu_model > 5) {
390 *cpu_type = "i386/piii";
391 } else if (cpu_model > 2) {
392 *cpu_type = "i386/pii";
393 } else {
394 *cpu_type = "i386/ppro";
395 }
396
397 model = &op_ppro_spec;
398 return 1;
399}
400
401/* in order to get sysfs right */
402static int using_nmi;
403
404int __init op_nmi_init(struct oprofile_operations *ops)
405{
406 __u8 vendor = boot_cpu_data.x86_vendor;
407 __u8 family = boot_cpu_data.x86;
408 char *cpu_type;
409
410 if (!cpu_has_apic)
411 return -ENODEV;
412
413 switch (vendor) {
414 case X86_VENDOR_AMD:
415 /* Needs to be at least an Athlon (or hammer in 32bit mode) */
416
417 switch (family) {
418 default:
419 return -ENODEV;
420 case 6:
421 model = &op_athlon_spec;
422 cpu_type = "i386/athlon";
423 break;
424 case 0xf:
425 model = &op_athlon_spec;
426 /* Actually it could be i386/hammer too, but give
427 user space a consistent name. */
428 cpu_type = "x86-64/hammer";
429 break;
430 case 0x10:
431 model = &op_athlon_spec;
432 cpu_type = "x86-64/family10";
433 break;
434 }
435 break;
436
437 case X86_VENDOR_INTEL:
438 switch (family) {
439 /* Pentium IV */
440 case 0xf:
441 if (!p4_init(&cpu_type))
442 return -ENODEV;
443 break;
444
445 /* A P6-class processor */
446 case 6:
447 if (!ppro_init(&cpu_type))
448 return -ENODEV;
449 break;
450
451 default:
452 return -ENODEV;
453 }
454 break;
455
456 default:
457 return -ENODEV;
458 }
459
460 init_sysfs();
461 using_nmi = 1;
462 ops->create_files = nmi_create_files;
463 ops->setup = nmi_setup;
464 ops->shutdown = nmi_shutdown;
465 ops->start = nmi_start;
466 ops->stop = nmi_stop;
467 ops->cpu_type = cpu_type;
468 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
469 return 0;
470}
471
472
473void op_nmi_exit(void)
474{
475 if (using_nmi)
476 exit_sysfs();
477}
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
new file mode 100644
index 000000000000..1418e36ae7ab
--- /dev/null
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -0,0 +1,69 @@
1/**
2 * @file nmi_timer_int.c
3 *
4 * @remark Copyright 2003 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author Zwane Mwaikambo <zwane@linuxpower.ca>
8 */
9
10#include <linux/init.h>
11#include <linux/smp.h>
12#include <linux/errno.h>
13#include <linux/oprofile.h>
14#include <linux/rcupdate.h>
15#include <linux/kdebug.h>
16
17#include <asm/nmi.h>
18#include <asm/apic.h>
19#include <asm/ptrace.h>
20
21static int profile_timer_exceptions_notify(struct notifier_block *self,
22 unsigned long val, void *data)
23{
24 struct die_args *args = (struct die_args *)data;
25 int ret = NOTIFY_DONE;
26
27 switch(val) {
28 case DIE_NMI:
29 oprofile_add_sample(args->regs, 0);
30 ret = NOTIFY_STOP;
31 break;
32 default:
33 break;
34 }
35 return ret;
36}
37
38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL,
41 .priority = 0
42};
43
44static int timer_start(void)
45{
46 if (register_die_notifier(&profile_timer_exceptions_nb))
47 return 1;
48 return 0;
49}
50
51
52static void timer_stop(void)
53{
54 unregister_die_notifier(&profile_timer_exceptions_nb);
55 synchronize_sched(); /* Allow already-started NMIs to complete. */
56}
57
58
59int __init op_nmi_timer_init(struct oprofile_operations * ops)
60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start;
65 ops->stop = timer_stop;
66 ops->cpu_type = "timer";
67 printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
68 return 0;
69}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
new file mode 100644
index 000000000000..2880b15c4675
--- /dev/null
+++ b/arch/x86/oprofile/op_counter.h
@@ -0,0 +1,29 @@
1/**
2 * @file op_counter.h
3 *
4 * @remark Copyright 2002 OProfile authors
5 * @remark Read the file COPYING
6 *
7 * @author John Levon
8 */
9
10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H
12
13#define OP_MAX_COUNTER 8
14
15/* Per-perfctr configuration as set via
16 * oprofilefs.
17 */
18struct op_counter_config {
19 unsigned long count;
20 unsigned long enabled;
21 unsigned long event;
22 unsigned long kernel;
23 unsigned long user;
24 unsigned long unit_mask;
25};
26
27extern struct op_counter_config counter_config[];
28
29#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
new file mode 100644
index 000000000000..3057a19e4641
--- /dev/null
+++ b/arch/x86/oprofile/op_model_athlon.c
@@ -0,0 +1,180 @@
1/**
2 * @file op_model_athlon.c
3 * athlon / K7 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
26#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l,h,msrs,c) do {rdmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
31#define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR(x) (x &= (1<<21))
35#define CTRL_SET_ENABLE(val) (val |= 1<<20)
36#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
37#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
38#define CTRL_SET_UM(val, m) (val |= (m << 8))
39#define CTRL_SET_EVENT(val, e) (val |= e)
40
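/*
 * Worked example of the EVNTSEL value these macros build (event 0x76 is
 * just an illustrative number): for user=1, kernel=1, unit_mask=0 and
 * event=0x76, athlon_setup_ctrs() ends up writing
 *
 *	low = (1 << 20) | (1 << 17) | (1 << 16) | 0x76 = 0x00130076
 *
 * and athlon_start() later ORs in the active bit (1 << 22), giving
 * 0x00530076.
 */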
41static unsigned long reset_value[NUM_COUNTERS];
42
43static void athlon_fill_in_addresses(struct op_msrs * const msrs)
44{
45 int i;
46
47 for (i=0; i < NUM_COUNTERS; i++) {
48 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
49 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
50 else
51 msrs->counters[i].addr = 0;
52 }
53
54 for (i=0; i < NUM_CONTROLS; i++) {
55 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
56 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
57 else
58 msrs->controls[i].addr = 0;
59 }
60}
61
62
63static void athlon_setup_ctrs(struct op_msrs const * const msrs)
64{
65 unsigned int low, high;
66 int i;
67
68 /* clear all counters */
69 for (i = 0 ; i < NUM_CONTROLS; ++i) {
70 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
71 continue;
72 CTRL_READ(low, high, msrs, i);
73 CTRL_CLEAR(low);
74 CTRL_WRITE(low, high, msrs, i);
75 }
76
77 /* avoid a false detection of ctr overflows in NMI handler */
78 for (i = 0; i < NUM_COUNTERS; ++i) {
79 if (unlikely(!CTR_IS_RESERVED(msrs,i)))
80 continue;
81 CTR_WRITE(1, msrs, i);
82 }
83
84 /* enable active counters */
85 for (i = 0; i < NUM_COUNTERS; ++i) {
86 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
87 reset_value[i] = counter_config[i].count;
88
89 CTR_WRITE(counter_config[i].count, msrs, i);
90
91 CTRL_READ(low, high, msrs, i);
92 CTRL_CLEAR(low);
93 CTRL_SET_ENABLE(low);
94 CTRL_SET_USR(low, counter_config[i].user);
95 CTRL_SET_KERN(low, counter_config[i].kernel);
96 CTRL_SET_UM(low, counter_config[i].unit_mask);
97 CTRL_SET_EVENT(low, counter_config[i].event);
98 CTRL_WRITE(low, high, msrs, i);
99 } else {
100 reset_value[i] = 0;
101 }
102 }
103}
104
105
106static int athlon_check_ctrs(struct pt_regs * const regs,
107 struct op_msrs const * const msrs)
108{
109 unsigned int low, high;
110 int i;
111
112 for (i = 0 ; i < NUM_COUNTERS; ++i) {
113 if (!reset_value[i])
114 continue;
115 CTR_READ(low, high, msrs, i);
116 if (CTR_OVERFLOWED(low)) {
117 oprofile_add_sample(regs, i);
118 CTR_WRITE(reset_value[i], msrs, i);
119 }
120 }
121
122 /* See op_model_ppro.c */
123 return 1;
124}
125
126
127static void athlon_start(struct op_msrs const * const msrs)
128{
129 unsigned int low, high;
130 int i;
131 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
132 if (reset_value[i]) {
133 CTRL_READ(low, high, msrs, i);
134 CTRL_SET_ACTIVE(low);
135 CTRL_WRITE(low, high, msrs, i);
136 }
137 }
138}
139
140
141static void athlon_stop(struct op_msrs const * const msrs)
142{
143 unsigned int low,high;
144 int i;
145
146 /* Subtle: stop on all counters to avoid race with
147 * setting our pm callback */
148 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
149 if (!reset_value[i])
150 continue;
151 CTRL_READ(low, high, msrs, i);
152 CTRL_SET_INACTIVE(low);
153 CTRL_WRITE(low, high, msrs, i);
154 }
155}
156
157static void athlon_shutdown(struct op_msrs const * const msrs)
158{
159 int i;
160
161 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
162 if (CTR_IS_RESERVED(msrs,i))
163 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
164 }
165 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
166 if (CTRL_IS_RESERVED(msrs,i))
167 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
168 }
169}
170
171struct op_x86_model_spec const op_athlon_spec = {
172 .num_counters = NUM_COUNTERS,
173 .num_controls = NUM_CONTROLS,
174 .fill_in_addresses = &athlon_fill_in_addresses,
175 .setup_ctrs = &athlon_setup_ctrs,
176 .check_ctrs = &athlon_check_ctrs,
177 .start = &athlon_start,
178 .stop = &athlon_stop,
179 .shutdown = &athlon_shutdown
180};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
new file mode 100644
index 000000000000..47925927b12f
--- /dev/null
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -0,0 +1,722 @@
1/**
2 * @file op_model_p4.c
3 * P4 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author Graydon Hoare
9 */
10
11#include <linux/oprofile.h>
12#include <linux/smp.h>
13#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h>
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#include "op_x86_model.h"
20#include "op_counter.h"
21
22#define NUM_EVENTS 39
23
24#define NUM_COUNTERS_NON_HT 8
25#define NUM_ESCRS_NON_HT 45
26#define NUM_CCCRS_NON_HT 18
27#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
28
29#define NUM_COUNTERS_HT2 4
30#define NUM_ESCRS_HT2 23
31#define NUM_CCCRS_HT2 9
32#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
33
34static unsigned int num_counters = NUM_COUNTERS_NON_HT;
35static unsigned int num_controls = NUM_CONTROLS_NON_HT;
36
37/* this has to be checked dynamically since the
38 hyper-threadedness of a chip is discovered at
39 kernel boot-time. */
40static inline void setup_num_counters(void)
41{
42#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){
44 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2;
46 }
47#endif
48}
49
50static inline int addr_increment(void)
51{
52#ifdef CONFIG_SMP
53 return smp_num_siblings == 2 ? 2 : 1;
54#else
55 return 1;
56#endif
57}
58
59
60/* tables to simulate simplified hardware view of p4 registers */
61struct p4_counter_binding {
62 int virt_counter;
63 int counter_address;
64 int cccr_address;
65};
66
67struct p4_event_binding {
68 int escr_select; /* value to put in CCCR */
69 int event_select; /* value to put in ESCR */
70 struct {
71 int virt_counter; /* for this counter... */
72 int escr_address; /* use this ESCR */
73 } bindings[2];
74};
75
76/* nb: these CTR_* defines are a duplicate of defines in
77 event/i386.p4*events. */
78
79
80#define CTR_BPU_0 (1 << 0)
81#define CTR_MS_0 (1 << 1)
82#define CTR_FLAME_0 (1 << 2)
83#define CTR_IQ_4 (1 << 3)
84#define CTR_BPU_2 (1 << 4)
85#define CTR_MS_2 (1 << 5)
86#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7)
88
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
93 { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
94 { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
95 { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
96 { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98};
99
100#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101
102/* p4 event codes in libop/op_event.h are indices into this table. */
103
104static struct p4_event_binding p4_events[NUM_EVENTS] = {
105
106 { /* BRANCH_RETIRED */
107 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 },
111
112 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 },
117
118 { /* TC_DELIVER_MODE */
119 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 },
123
124 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 },
129
130 { /* ITLB_REFERENCE */
131 0x03, 0x18,
132 { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
133 { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
134 },
135
136 { /* MEMORY_CANCEL */
137 0x05, 0x02,
138 { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
139 { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
140 },
141
142 { /* MEMORY_COMPLETE */
143 0x02, 0x08,
144 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
145 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
146 },
147
148 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 },
153
154 { /* STORE_PORT_REPLAY */
155 0x02, 0x05,
156 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
157 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
158 },
159
160 { /* MOB_LOAD_REPLAY */
161 0x02, 0x03,
162 { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
163 { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
164 },
165
166 { /* PAGE_WALK_TYPE */
167 0x04, 0x01,
168 { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
169 { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
170 },
171
172 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 },
177
178 { /* IOQ_ALLOCATION */
179 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } }
182 },
183
184 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } }
188 },
189
190 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 },
195
196 { /* BSQ_ALLOCATION */
197 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } }
200 },
201
202 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } }
206 },
207
208 { /* X87_ASSIST */
209 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 },
213
214 { /* SSE_INPUT_ASSIST */
215 0x01, 0x34,
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 },
219
220 { /* PACKED_SP_UOP */
221 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 },
225
226 { /* PACKED_DP_UOP */
227 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 },
231
232 { /* SCALAR_SP_UOP */
233 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 },
237
238 { /* SCALAR_DP_UOP */
239 0x01, 0x0e,
240 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
241 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
242 },
243
244 { /* 64BIT_MMX_UOP */
245 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 },
249
250 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 },
255
256 { /* X87_FP_UOP */
257 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 },
261
262 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 },
267
268 { /* MACHINE_CLEAR */
269 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 },
273
274 { /* GLOBAL_POWER_EVENTS */
275 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 },
279
280 { /* TC_MS_XFER */
281 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 },
285
286 { /* UOP_QUEUE_WRITES */
287 0x00, 0x09,
288 { { CTR_MS_0, MSR_P4_MS_ESCR0},
289 { CTR_MS_2, MSR_P4_MS_ESCR1} }
290 },
291
292 { /* FRONT_END_EVENT */
293 0x05, 0x08,
294 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
295 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
296 },
297
298 { /* EXECUTION_EVENT */
299 0x05, 0x0c,
300 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
301 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
302 },
303
304 { /* REPLAY_EVENT */
305 0x05, 0x09,
306 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
307 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
308 },
309
310 { /* INSTR_RETIRED */
311 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 },
315
316 { /* UOPS_RETIRED */
317 0x04, 0x01,
318 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 },
321
322 { /* UOP_TYPE */
323 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 },
327
328 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 },
333
334 { /* RETIRED_BRANCH_TYPE */
335 0x02, 0x04,
336 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
337 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
338 }
339};
340
341
342#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
343
344#define ESCR_RESERVED_BITS 0x80000003
345#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
346#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
347#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
348#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
354
355#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
357#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
358#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
359#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373
374
375/* this assigns a "stagger" to the current CPU, which is used throughout
376 the code in this module as an extra array offset, to select the "even"
377 or "odd" part of all the divided resources. */
378static unsigned int get_stagger(void)
379{
380#ifdef CONFIG_SMP
381 int cpu = smp_processor_id();
382 return (cpu != first_cpu(cpu_sibling_map[cpu]));
383#endif
384 return 0;
385}
386
387
388/* finally, mediate access to a real hardware counter
389 by passing a "virtual" counter number to this macro,
390 along with your stagger setting. */
391#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
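/*
 * Mapping example for the HT case (num_counters == 4): the even sibling
 * (stagger 0) sees virtual counters 0-3 as p4_counters[] entries 0-3
 * (BPU_0, MS_0, FLAME_0, IQ_4), while the odd sibling (stagger 1) sees the
 * same virtual counters 0-3 as entries 4-7 (BPU_2, MS_2, FLAME_2, IQ_5).
 */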
392
393static unsigned long reset_value[NUM_COUNTERS_NON_HT];
394
395
396static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{
398 unsigned int i;
399 unsigned int addr, cccraddr, stag;
400
401 setup_num_counters();
402 stag = get_stagger();
403
404 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) {
406 msrs->counters[i].addr = 0;
407 }
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0;
410 }
411
412 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){
417 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr;
419 }
420 }
421
422 /* 43 ESCR registers in three or four discontiguous groups */
423 for (addr = MSR_P4_BSU_ESCR0 + stag;
424 addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
425 if (reserve_evntsel_nmi(addr))
426 msrs->controls[i].addr = addr;
427 }
428
429 /* no IQ_ESCR0/1 on some models; we save BSU_ESCR0/1 a second time
430 * to avoid a special case in nmi_{save|restore}_registers() */
431 if (boot_cpu_data.x86_model >= 0x3) {
432 for (addr = MSR_P4_BSU_ESCR0 + stag;
433 addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
434 if (reserve_evntsel_nmi(addr))
435 msrs->controls[i].addr = addr;
436 }
437 } else {
438 for (addr = MSR_P4_IQ_ESCR0 + stag;
439 addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
440 if (reserve_evntsel_nmi(addr))
441 msrs->controls[i].addr = addr;
442 }
443 }
444
445 for (addr = MSR_P4_RAT_ESCR0 + stag;
446 addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
447 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr;
449 }
450
451 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr;
455 }
456
457 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr;
461 }
462
463 /* there are 2 remaining non-contiguously located ESCRs */
464
465 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
469 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
470 msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
471
472 } else if (stag == 0) {
473 /* HT CPUs give the first remainder to the even thread, as
474 the 32nd control register */
475 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
476 msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
477
478 } else {
479 /* and two copies of the second to the odd thread,
480		   for the 22nd and 23rd control registers */
481 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
482 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
483 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
484 }
485 }
486}
487
488
489static void pmc_setup_one_p4_counter(unsigned int ctr)
490{
491 int i;
492 int const maxbind = 2;
493 unsigned int cccr = 0;
494 unsigned int escr = 0;
495 unsigned int high = 0;
496 unsigned int counter_bit;
497 struct p4_event_binding *ev = NULL;
498 unsigned int stag;
499
500 stag = get_stagger();
501
502 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr);
504
505 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event);
510 return;
511 }
512
513 ev = &(p4_events[counter_config[ctr].event - 1]);
514
515 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) {
517
518 /* modify ESCR */
519 ESCR_READ(escr, high, ev, i);
520 ESCR_CLEAR(escr);
521 if (stag == 0) {
522 ESCR_SET_USR_0(escr, counter_config[ctr].user);
523 ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
524 } else {
525 ESCR_SET_USR_1(escr, counter_config[ctr].user);
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i);
531
532 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) {
538 CCCR_SET_PMI_OVF_0(cccr);
539 } else {
540 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return;
544 }
545 }
546
547 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr);
550}
551
552
553static void p4_setup_ctrs(struct op_msrs const * const msrs)
554{
555 unsigned int i;
556 unsigned int low, high;
557 unsigned int stag;
558
559 stag = get_stagger();
560
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return;
565 }
566
567 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
570 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low);
573 CCCR_SET_REQUIRED_BITS(low);
574 wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
575 }
576
577 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
580 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0);
582 }
583
584 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
587 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
590 } else {
591 reset_value[i] = 0;
592 }
593 }
594}
595
596
597static int p4_check_ctrs(struct pt_regs * const regs,
598 struct op_msrs const * const msrs)
599{
600 unsigned long ctr, low, high, stag, real;
601 int i;
602
603 stag = get_stagger();
604
605 for (i = 0; i < num_counters; ++i) {
606
607 if (!reset_value[i])
608 continue;
609
610 /*
611 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections:
613 *
614 * - check both the CCCR:OVF flag for overflow and the
615 * counter high bit for un-flagged overflows.
616 *
617 * - write the counter back twice to ensure it gets
618 * updated properly.
619 *
620 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not
624 * appear to solve the problem.
625 */
626
627 real = VIRT_CTR(stag, i);
628
629 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real);
637 }
638 }
639
640 /* P4 quirk: you have to re-unmask the apic vector */
641 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
642
643 /* See op_model_ppro.c */
644 return 1;
645}
646
647
648static void p4_start(struct op_msrs const * const msrs)
649{
650 unsigned int low, high, stag;
651 int i;
652
653 stag = get_stagger();
654
655 for (i = 0; i < num_counters; ++i) {
656 if (!reset_value[i])
657 continue;
658 CCCR_READ(low, high, VIRT_CTR(stag, i));
659 CCCR_SET_ENABLE(low);
660 CCCR_WRITE(low, high, VIRT_CTR(stag, i));
661 }
662}
663
664
665static void p4_stop(struct op_msrs const * const msrs)
666{
667 unsigned int low, high, stag;
668 int i;
669
670 stag = get_stagger();
671
672 for (i = 0; i < num_counters; ++i) {
673 if (!reset_value[i])
674 continue;
675 CCCR_READ(low, high, VIRT_CTR(stag, i));
676 CCCR_SET_DISABLE(low);
677 CCCR_WRITE(low, high, VIRT_CTR(stag, i));
678 }
679}
680
681static void p4_shutdown(struct op_msrs const * const msrs)
682{
683 int i;
684
685 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i))
687 release_perfctr_nmi(msrs->counters[i].addr);
688 }
689	/* the first num_counters control registers hold the CCCR addresses,
690	 * which were reserved together with the counters by reserve_perfctr_nmi()
691	 * (hence the starting offset below); only the remaining event-select
692	 * MSRs need to be released here. */
693 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i))
695 release_evntsel_nmi(msrs->controls[i].addr);
696 }
697}
698
699
700#ifdef CONFIG_SMP
701struct op_x86_model_spec const op_p4_ht2_spec = {
702 .num_counters = NUM_COUNTERS_HT2,
703 .num_controls = NUM_CONTROLS_HT2,
704 .fill_in_addresses = &p4_fill_in_addresses,
705 .setup_ctrs = &p4_setup_ctrs,
706 .check_ctrs = &p4_check_ctrs,
707 .start = &p4_start,
708 .stop = &p4_stop,
709 .shutdown = &p4_shutdown
710};
711#endif
712
713struct op_x86_model_spec const op_p4_spec = {
714 .num_counters = NUM_COUNTERS_NON_HT,
715 .num_controls = NUM_CONTROLS_NON_HT,
716 .fill_in_addresses = &p4_fill_in_addresses,
717 .setup_ctrs = &p4_setup_ctrs,
718 .check_ctrs = &p4_check_ctrs,
719 .start = &p4_start,
720 .stop = &p4_stop,
721 .shutdown = &p4_shutdown
722};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
new file mode 100644
index 000000000000..c554f52cb808
--- /dev/null
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -0,0 +1,192 @@
1/**
2 * @file op_model_ppro.c
3 * pentium pro / P6 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#include "op_x86_model.h"
20#include "op_counter.h"
21
22#define NUM_COUNTERS 2
23#define NUM_CONTROLS 2
24
25#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
26#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
27#define CTR_32BIT_WRITE(l,msrs,c) \
28 do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
29#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
30
31#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
32#define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
33#define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
34#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
35#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
36#define CTRL_CLEAR(x) (x &= (1<<21))
37#define CTRL_SET_ENABLE(val) (val |= 1<<20)
38#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
39#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
40#define CTRL_SET_UM(val, m) (val |= (m << 8))
41#define CTRL_SET_EVENT(val, e) (val |= e)
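/* Note on the bit positions above (assuming the standard P6 PERFEVTSEL
 * layout): bits 0-7 select the event, bits 8-15 the unit mask, bit 16
 * counts user mode, bit 17 counts kernel mode, bit 20 enables the PMI on
 * overflow, and bit 22 actually starts/stops counting (the ACTIVE/INACTIVE
 * macros). */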
42
43static unsigned long reset_value[NUM_COUNTERS];
44
45static void ppro_fill_in_addresses(struct op_msrs * const msrs)
46{
47 int i;
48
49 for (i=0; i < NUM_COUNTERS; i++) {
50 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
51 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
52 else
53 msrs->counters[i].addr = 0;
54 }
55
56 for (i=0; i < NUM_CONTROLS; i++) {
57 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
58 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
59 else
60 msrs->controls[i].addr = 0;
61 }
62}
63
64
65static void ppro_setup_ctrs(struct op_msrs const * const msrs)
66{
67 unsigned int low, high;
68 int i;
69
70	/* clear the control (event select) registers */
71 for (i = 0 ; i < NUM_CONTROLS; ++i) {
72 if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
73 continue;
74 CTRL_READ(low, high, msrs, i);
75 CTRL_CLEAR(low);
76 CTRL_WRITE(low, high, msrs, i);
77 }
78
79 /* avoid a false detection of ctr overflows in NMI handler */
80 for (i = 0; i < NUM_COUNTERS; ++i) {
81 if (unlikely(!CTR_IS_RESERVED(msrs,i)))
82 continue;
83 CTR_32BIT_WRITE(1, msrs, i);
84 }
85
86 /* enable active counters */
87 for (i = 0; i < NUM_COUNTERS; ++i) {
88 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
89 reset_value[i] = counter_config[i].count;
90
91 CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
92
93 CTRL_READ(low, high, msrs, i);
94 CTRL_CLEAR(low);
95 CTRL_SET_ENABLE(low);
96 CTRL_SET_USR(low, counter_config[i].user);
97 CTRL_SET_KERN(low, counter_config[i].kernel);
98 CTRL_SET_UM(low, counter_config[i].unit_mask);
99 CTRL_SET_EVENT(low, counter_config[i].event);
100 CTRL_WRITE(low, high, msrs, i);
101 } else {
102 reset_value[i] = 0;
103 }
104 }
105}
106
107
108static int ppro_check_ctrs(struct pt_regs * const regs,
109 struct op_msrs const * const msrs)
110{
111 unsigned int low, high;
112 int i;
113
114 for (i = 0 ; i < NUM_COUNTERS; ++i) {
115 if (!reset_value[i])
116 continue;
117 CTR_READ(low, high, msrs, i);
118 if (CTR_OVERFLOWED(low)) {
119 oprofile_add_sample(regs, i);
120 CTR_32BIT_WRITE(reset_value[i], msrs, i);
121 }
122 }
123
124	/* Only the P6-based Pentium M needs to re-unmask the apic vector, but
125	 * it doesn't hurt the other P6 variants */
126 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
127
128	/* We can't work out if we really handled an interrupt. We
129	 * might have caught a *second* counter just after it overflowed;
130	 * the interrupt for this counter then arrives, we don't find a
131	 * counter that has overflowed, and we would return 0 and get
132	 * dazed + confused. Instead we always assume we found an
133	 * overflow. This sucks.
134	 */
135 return 1;
136}
137
138
139static void ppro_start(struct op_msrs const * const msrs)
140{
141 unsigned int low,high;
142 int i;
143
144 for (i = 0; i < NUM_COUNTERS; ++i) {
145 if (reset_value[i]) {
146 CTRL_READ(low, high, msrs, i);
147 CTRL_SET_ACTIVE(low);
148 CTRL_WRITE(low, high, msrs, i);
149 }
150 }
151}
152
153
154static void ppro_stop(struct op_msrs const * const msrs)
155{
156 unsigned int low,high;
157 int i;
158
159 for (i = 0; i < NUM_COUNTERS; ++i) {
160 if (!reset_value[i])
161 continue;
162 CTRL_READ(low, high, msrs, i);
163 CTRL_SET_INACTIVE(low);
164 CTRL_WRITE(low, high, msrs, i);
165 }
166}
167
168static void ppro_shutdown(struct op_msrs const * const msrs)
169{
170 int i;
171
172 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
173 if (CTR_IS_RESERVED(msrs,i))
174 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
175 }
176 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
177 if (CTRL_IS_RESERVED(msrs,i))
178 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
179 }
180}
181
182
183struct op_x86_model_spec const op_ppro_spec = {
184 .num_counters = NUM_COUNTERS,
185 .num_controls = NUM_CONTROLS,
186 .fill_in_addresses = &ppro_fill_in_addresses,
187 .setup_ctrs = &ppro_setup_ctrs,
188 .check_ctrs = &ppro_check_ctrs,
189 .start = &ppro_start,
190 .stop = &ppro_stop,
191 .shutdown = &ppro_shutdown
192};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
new file mode 100644
index 000000000000..abb1aa95b979
--- /dev/null
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -0,0 +1,51 @@
1/**
2 * @file op_x86_model.h
3 * interface to x86 model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author Graydon Hoare
9 */
10
11#ifndef OP_X86_MODEL_H
12#define OP_X86_MODEL_H
13
14struct op_saved_msr {
15 unsigned int high;
16 unsigned int low;
17};
18
19struct op_msr {
20 unsigned long addr;
21 struct op_saved_msr saved;
22};
23
24struct op_msrs {
25 struct op_msr * counters;
26 struct op_msr * controls;
27};
28
29struct pt_regs;
30
31/* The model vtable abstracts the differences between
32 * various x86 CPU models' perfctr support.
33 */
34struct op_x86_model_spec {
35 unsigned int const num_counters;
36 unsigned int const num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs);
38 void (*setup_ctrs)(struct op_msrs const * const msrs);
39 int (*check_ctrs)(struct pt_regs * const regs,
40 struct op_msrs const * const msrs);
41 void (*start)(struct op_msrs const * const msrs);
42 void (*stop)(struct op_msrs const * const msrs);
43 void (*shutdown)(struct op_msrs const * const msrs);
44};
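/* Typical call sequence (a sketch, assuming the nmi_int.c driver):
 * fill_in_addresses() reserves and records the MSRs, setup_ctrs()
 * programs them, start()/stop() toggle counting, the NMI handler calls
 * check_ctrs() on each performance-counter interrupt, and shutdown()
 * releases the reserved MSRs. */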
45
46extern struct op_x86_model_spec const op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec;
50
51#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
new file mode 100644
index 000000000000..c5c8e485fc44
--- /dev/null
+++ b/arch/x86/pci/Makefile
@@ -0,0 +1,5 @@
1ifeq ($(CONFIG_X86_32),y)
2include ${srctree}/arch/x86/pci/Makefile_32
3else
4include ${srctree}/arch/x86/pci/Makefile_64
5endif
diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32
new file mode 100644
index 000000000000..cdd6828b5abb
--- /dev/null
+++ b/arch/x86/pci/Makefile_32
@@ -0,0 +1,14 @@
1obj-y := i386.o init.o
2
3obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o
6
7pci-y := fixup.o
8pci-$(CONFIG_ACPI) += acpi.o
9pci-y += legacy.o irq.o
10
11pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
12pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
13
14obj-y += $(pci-y) common.o early.o
diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64
new file mode 100644
index 000000000000..7d8c467bf143
--- /dev/null
+++ b/arch/x86/pci/Makefile_64
@@ -0,0 +1,17 @@
1#
2# Makefile for X86_64 specific PCI routines
3#
4# Reuse the i386 PCI subsystem
5#
6EXTRA_CFLAGS += -Iarch/x86/pci
7
8obj-y := i386.o
9obj-$(CONFIG_PCI_DIRECT)+= direct.o
10obj-y += fixup.o init.o
11obj-$(CONFIG_ACPI) += acpi.o
12obj-y += legacy.o irq.o common.o early.o
13# mmconfig has a 64bit special
14obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o
15
16obj-$(CONFIG_NUMA) += k8-bus_64.o
17
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
new file mode 100644
index 000000000000..bc8a44bddaa7
--- /dev/null
+++ b/arch/x86/pci/acpi.c
@@ -0,0 +1,90 @@
1#include <linux/pci.h>
2#include <linux/acpi.h>
3#include <linux/init.h>
4#include <linux/irq.h>
5#include <asm/numa.h>
6#include "pci.h"
7
8struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
9{
10 struct pci_bus *bus;
11 struct pci_sysdata *sd;
12 int pxm;
13
14 /* Allocate per-root-bus (not per bus) arch-specific data.
15 * TODO: leak; this memory is never freed.
16 * It's arguable whether it's worth the trouble to care.
17 */
18 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
19 if (!sd) {
20 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
21 return NULL;
22 }
23
24 if (domain != 0) {
25 printk(KERN_WARNING "PCI: Multiple domains not supported\n");
26 kfree(sd);
27 return NULL;
28 }
29
30 sd->node = -1;
31
32 pxm = acpi_get_pxm(device->handle);
33#ifdef CONFIG_ACPI_NUMA
34 if (pxm >= 0)
35 sd->node = pxm_to_node(pxm);
36#endif
37
38 bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
39 if (!bus)
40 kfree(sd);
41
42#ifdef CONFIG_ACPI_NUMA
43 if (bus != NULL) {
44 if (pxm >= 0) {
45 printk("bus %d -> pxm %d -> node %d\n",
46 busnum, pxm, sd->node);
47 }
48 }
49#endif
50
51 return bus;
52}
53
54extern int pci_routeirq;
55static int __init pci_acpi_init(void)
56{
57 struct pci_dev *dev = NULL;
58
59 if (pcibios_scanned)
60 return 0;
61
62 if (acpi_noirq)
63 return 0;
64
65 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
66 acpi_irq_penalty_init();
67 pcibios_scanned++;
68 pcibios_enable_irq = acpi_pci_irq_enable;
69 pcibios_disable_irq = acpi_pci_irq_disable;
70
71 if (pci_routeirq) {
72 /*
73 * PCI IRQ routing is set up by pci_enable_device(), but we
74 * also do it here in case there are still broken drivers that
75 * don't use pci_enable_device().
76 */
77 printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
78 for_each_pci_dev(dev)
79 acpi_pci_irq_enable(dev);
80 } else
81 printk(KERN_INFO "PCI: If a device doesn't work, try \"pci=routeirq\". If it helps, post a report\n");
82
83#ifdef CONFIG_X86_IO_APIC
84 if (acpi_ioapic)
85 print_IO_APIC();
86#endif
87
88 return 0;
89}
90subsys_initcall(pci_acpi_init);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
new file mode 100644
index 000000000000..07d5223442bf
--- /dev/null
+++ b/arch/x86/pci/common.c
@@ -0,0 +1,480 @@
1/*
2 * Low-Level PCI Support for PC
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/sched.h>
8#include <linux/pci.h>
9#include <linux/ioport.h>
10#include <linux/init.h>
11#include <linux/dmi.h>
12
13#include <asm/acpi.h>
14#include <asm/segment.h>
15#include <asm/io.h>
16#include <asm/smp.h>
17
18#include "pci.h"
19
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF;
22
23static int pci_bf_sort;
24int pci_routeirq;
25int pcibios_last_bus = -1;
26unsigned long pirq_table_addr;
27struct pci_bus *pci_root_bus;
28struct pci_raw_ops *raw_pci_ops;
29
30static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
31{
32 return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
33}
34
35static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
36{
37 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
38}
39
40struct pci_ops pci_root_ops = {
41 .read = pci_read,
42 .write = pci_write,
43};
44
45/*
46 * legacy, numa, and acpi all want to call pcibios_scan_root
47 * from their initcalls. This flag ensures that only the first one to run actually does.
48 */
49int pcibios_scanned;
50
51/*
52 * This interrupt-safe spinlock protects all accesses to PCI
53 * configuration space.
54 */
55DEFINE_SPINLOCK(pci_config_lock);
56
57/*
58 * Several buggy motherboards address only 16 devices and mirror
59 * them to the next 16 IDs. We try to detect this `feature' on all
60 * primary buses (those containing host bridges as they are
61 * expected to be unique) and remove the ghost devices.
62 */
63
64static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
65{
66 struct list_head *ln, *mn;
67 struct pci_dev *d, *e;
68 int mirror = PCI_DEVFN(16,0);
69 int seen_host_bridge = 0;
70 int i;
71
72 DBG("PCI: Scanning for ghost devices on bus %d\n", b->number);
73 list_for_each(ln, &b->devices) {
74 d = pci_dev_b(ln);
75 if ((d->class >> 8) == PCI_CLASS_BRIDGE_HOST)
76 seen_host_bridge++;
77 for (mn=ln->next; mn != &b->devices; mn=mn->next) {
78 e = pci_dev_b(mn);
79 if (e->devfn != d->devfn + mirror ||
80 e->vendor != d->vendor ||
81 e->device != d->device ||
82 e->class != d->class)
83 continue;
84 for(i=0; i<PCI_NUM_RESOURCES; i++)
85 if (e->resource[i].start != d->resource[i].start ||
86 e->resource[i].end != d->resource[i].end ||
87 e->resource[i].flags != d->resource[i].flags)
88 continue;
89 break;
90 }
91 if (mn == &b->devices)
92 return;
93 }
94 if (!seen_host_bridge)
95 return;
96 printk(KERN_WARNING "PCI: Ignoring ghost devices on bus %02x\n", b->number);
97
98 ln = &b->devices;
99 while (ln->next != &b->devices) {
100 d = pci_dev_b(ln->next);
101 if (d->devfn >= mirror) {
102 list_del(&d->global_list);
103 list_del(&d->bus_list);
104 kfree(d);
105 } else
106 ln = ln->next;
107 }
108}
109
110/*
111 * Called after each bus is probed, but before its children
112 * are examined.
113 */
114
115void __devinit pcibios_fixup_bus(struct pci_bus *b)
116{
117 pcibios_fixup_ghosts(b);
118 pci_read_bridge_bases(b);
119}
120
121/*
122 * Only use DMI information to set this if nothing was passed
123 * on the kernel command line (which was parsed earlier).
124 */
125
126static int __devinit set_bf_sort(const struct dmi_system_id *d)
127{
128 if (pci_bf_sort == pci_bf_sort_default) {
129 pci_bf_sort = pci_dmi_bf;
130 printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
131 }
132 return 0;
133}
134
135/*
136 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
137 */
138#ifdef __i386__
139static int __devinit assign_all_busses(const struct dmi_system_id *d)
140{
141 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
142 printk(KERN_INFO "%s detected: enabling PCI bus# renumbering"
143 " (pci=assign-busses)\n", d->ident);
144 return 0;
145}
146#endif
147
148static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
149#ifdef __i386__
150/*
151 * Laptops which need pci=assign-busses to see Cardbus cards
152 */
153 {
154 .callback = assign_all_busses,
155 .ident = "Samsung X20 Laptop",
156 .matches = {
157 DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
158 DMI_MATCH(DMI_PRODUCT_NAME, "SX20S"),
159 },
160 },
161#endif /* __i386__ */
162 {
163 .callback = set_bf_sort,
164 .ident = "Dell PowerEdge 1950",
165 .matches = {
166 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
167 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
168 },
169 },
170 {
171 .callback = set_bf_sort,
172 .ident = "Dell PowerEdge 1955",
173 .matches = {
174 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
175 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
176 },
177 },
178 {
179 .callback = set_bf_sort,
180 .ident = "Dell PowerEdge 2900",
181 .matches = {
182 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
183 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
184 },
185 },
186 {
187 .callback = set_bf_sort,
188 .ident = "Dell PowerEdge 2950",
189 .matches = {
190 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
191 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
192 },
193 },
194 {
195 .callback = set_bf_sort,
196 .ident = "Dell PowerEdge R900",
197 .matches = {
198 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
199 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R900"),
200 },
201 },
202 {
203 .callback = set_bf_sort,
204 .ident = "HP ProLiant BL20p G3",
205 .matches = {
206 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
207 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G3"),
208 },
209 },
210 {
211 .callback = set_bf_sort,
212 .ident = "HP ProLiant BL20p G4",
213 .matches = {
214 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
215 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G4"),
216 },
217 },
218 {
219 .callback = set_bf_sort,
220 .ident = "HP ProLiant BL30p G1",
221 .matches = {
222 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
223 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL30p G1"),
224 },
225 },
226 {
227 .callback = set_bf_sort,
228 .ident = "HP ProLiant BL25p G1",
229 .matches = {
230 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
231 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL25p G1"),
232 },
233 },
234 {
235 .callback = set_bf_sort,
236 .ident = "HP ProLiant BL35p G1",
237 .matches = {
238 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
239 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL35p G1"),
240 },
241 },
242 {
243 .callback = set_bf_sort,
244 .ident = "HP ProLiant BL45p G1",
245 .matches = {
246 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
247 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G1"),
248 },
249 },
250 {
251 .callback = set_bf_sort,
252 .ident = "HP ProLiant BL45p G2",
253 .matches = {
254 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
255 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G2"),
256 },
257 },
258 {
259 .callback = set_bf_sort,
260 .ident = "HP ProLiant BL460c G1",
261 .matches = {
262 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
263 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL460c G1"),
264 },
265 },
266 {
267 .callback = set_bf_sort,
268 .ident = "HP ProLiant BL465c G1",
269 .matches = {
270 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
271 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL465c G1"),
272 },
273 },
274 {
275 .callback = set_bf_sort,
276 .ident = "HP ProLiant BL480c G1",
277 .matches = {
278 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
279 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL480c G1"),
280 },
281 },
282 {
283 .callback = set_bf_sort,
284 .ident = "HP ProLiant BL685c G1",
285 .matches = {
286 DMI_MATCH(DMI_SYS_VENDOR, "HP"),
287 DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"),
288 },
289 },
290 {}
291};
292
293struct pci_bus * __devinit pcibios_scan_root(int busnum)
294{
295 struct pci_bus *bus = NULL;
296 struct pci_sysdata *sd;
297
298 dmi_check_system(pciprobe_dmi_table);
299
300 while ((bus = pci_find_next_bus(bus)) != NULL) {
301 if (bus->number == busnum) {
302 /* Already scanned */
303 return bus;
304 }
305 }
306
307 /* Allocate per-root-bus (not per bus) arch-specific data.
308 * TODO: leak; this memory is never freed.
309 * It's arguable whether it's worth the trouble to care.
310 */
311 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
312 if (!sd) {
313 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
314 return NULL;
315 }
316
317 printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
318
319 return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
320}
321
322extern u8 pci_cache_line_size;
323
324static int __init pcibios_init(void)
325{
326 struct cpuinfo_x86 *c = &boot_cpu_data;
327
328 if (!raw_pci_ops) {
329 printk(KERN_WARNING "PCI: System does not support PCI\n");
330 return 0;
331 }
332
333 /*
334 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
335 * and P4. It's also good for 386/486s (which actually have 16)
336 * as quite a few PCI devices do not support smaller values.
337 */
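	/* the value is in units of 32-bit dwords, as stored in the PCI
	 * cache line size config register -- hence the ">> 2" below */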
338 pci_cache_line_size = 32 >> 2;
339 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
340 pci_cache_line_size = 64 >> 2; /* K7 & K8 */
341 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
342 pci_cache_line_size = 128 >> 2; /* P4 */
343
344 pcibios_resource_survey();
345
346 if (pci_bf_sort >= pci_force_bf)
347 pci_sort_breadthfirst();
348#ifdef CONFIG_PCI_BIOS
349 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
350 pcibios_sort();
351#endif
352 return 0;
353}
354
355subsys_initcall(pcibios_init);
356
357char * __devinit pcibios_setup(char *str)
358{
359 if (!strcmp(str, "off")) {
360 pci_probe = 0;
361 return NULL;
362 } else if (!strcmp(str, "bfsort")) {
363 pci_bf_sort = pci_force_bf;
364 return NULL;
365 } else if (!strcmp(str, "nobfsort")) {
366 pci_bf_sort = pci_force_nobf;
367 return NULL;
368 }
369#ifdef CONFIG_PCI_BIOS
370 else if (!strcmp(str, "bios")) {
371 pci_probe = PCI_PROBE_BIOS;
372 return NULL;
373 } else if (!strcmp(str, "nobios")) {
374 pci_probe &= ~PCI_PROBE_BIOS;
375 return NULL;
376 } else if (!strcmp(str, "nosort")) {
377 pci_probe |= PCI_NO_SORT;
378 return NULL;
379 } else if (!strcmp(str, "biosirq")) {
380 pci_probe |= PCI_BIOS_IRQ_SCAN;
381 return NULL;
382 } else if (!strncmp(str, "pirqaddr=", 9)) {
383 pirq_table_addr = simple_strtoul(str+9, NULL, 0);
384 return NULL;
385 }
386#endif
387#ifdef CONFIG_PCI_DIRECT
388 else if (!strcmp(str, "conf1")) {
389 pci_probe = PCI_PROBE_CONF1 | PCI_NO_CHECKS;
390 return NULL;
391 }
392 else if (!strcmp(str, "conf2")) {
393 pci_probe = PCI_PROBE_CONF2 | PCI_NO_CHECKS;
394 return NULL;
395 }
396#endif
397#ifdef CONFIG_PCI_MMCONFIG
398 else if (!strcmp(str, "nommconf")) {
399 pci_probe &= ~PCI_PROBE_MMCONF;
400 return NULL;
401 }
402#endif
403 else if (!strcmp(str, "noacpi")) {
404 acpi_noirq_set();
405 return NULL;
406 }
407 else if (!strcmp(str, "noearly")) {
408 pci_probe |= PCI_PROBE_NOEARLY;
409 return NULL;
410 }
411#ifndef CONFIG_X86_VISWS
412 else if (!strcmp(str, "usepirqmask")) {
413 pci_probe |= PCI_USE_PIRQ_MASK;
414 return NULL;
415 } else if (!strncmp(str, "irqmask=", 8)) {
416 pcibios_irq_mask = simple_strtol(str+8, NULL, 0);
417 return NULL;
418 } else if (!strncmp(str, "lastbus=", 8)) {
419 pcibios_last_bus = simple_strtol(str+8, NULL, 0);
420 return NULL;
421 }
422#endif
423 else if (!strcmp(str, "rom")) {
424 pci_probe |= PCI_ASSIGN_ROMS;
425 return NULL;
426 } else if (!strcmp(str, "assign-busses")) {
427 pci_probe |= PCI_ASSIGN_ALL_BUSSES;
428 return NULL;
429 } else if (!strcmp(str, "routeirq")) {
430 pci_routeirq = 1;
431 return NULL;
432 }
433 return str;
434}
435
436unsigned int pcibios_assign_all_busses(void)
437{
438 return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
439}
440
441int pcibios_enable_device(struct pci_dev *dev, int mask)
442{
443 int err;
444
445 if ((err = pcibios_enable_resources(dev, mask)) < 0)
446 return err;
447
448 if (!dev->msi_enabled)
449 return pcibios_enable_irq(dev);
450 return 0;
451}
452
453void pcibios_disable_device (struct pci_dev *dev)
454{
455 if (!dev->msi_enabled && pcibios_disable_irq)
456 pcibios_disable_irq(dev);
457}
458
459struct pci_bus *pci_scan_bus_with_sysdata(int busno)
460{
461 struct pci_bus *bus = NULL;
462 struct pci_sysdata *sd;
463
464 /*
465 * Allocate per-root-bus (not per bus) arch-specific data.
466 * TODO: leak; this memory is never freed.
467 * It's arguable whether it's worth the trouble to care.
468 */
469 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
470 if (!sd) {
471 printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
472 return NULL;
473 }
474 sd->node = -1;
475 bus = pci_scan_bus(busno, &pci_root_ops, sd);
476 if (!bus)
477 kfree(sd);
478
479 return bus;
480}
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
new file mode 100644
index 000000000000..431c9a51b157
--- /dev/null
+++ b/arch/x86/pci/direct.c
@@ -0,0 +1,302 @@
1/*
2 * direct.c - Low-level direct PCI config space access
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/dmi.h>
8#include "pci.h"
9
10/*
11 * Functions for accessing PCI configuration space with type 1 accesses
12 */
13
14#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
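/* Example encoding: bus 0, device 1, function 0 (devfn 0x08), register 0x10
 * yields 0x80000000 | (0 << 16) | (0x08 << 8) | 0x10 == 0x80000810, which is
 * written to port 0xCF8 before the data access on 0xCFC. */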
16
17int pci_conf1_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value)
19{
20 unsigned long flags;
21
22 if ((bus > 255) || (devfn > 255) || (reg > 255)) {
23 *value = -1;
24 return -EINVAL;
25 }
26
27 spin_lock_irqsave(&pci_config_lock, flags);
28
29 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
30
31 switch (len) {
32 case 1:
33 *value = inb(0xCFC + (reg & 3));
34 break;
35 case 2:
36 *value = inw(0xCFC + (reg & 2));
37 break;
38 case 4:
39 *value = inl(0xCFC);
40 break;
41 }
42
43 spin_unlock_irqrestore(&pci_config_lock, flags);
44
45 return 0;
46}
47
48int pci_conf1_write(unsigned int seg, unsigned int bus,
49 unsigned int devfn, int reg, int len, u32 value)
50{
51 unsigned long flags;
52
53 if ((bus > 255) || (devfn > 255) || (reg > 255))
54 return -EINVAL;
55
56 spin_lock_irqsave(&pci_config_lock, flags);
57
58 outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
59
60 switch (len) {
61 case 1:
62 outb((u8)value, 0xCFC + (reg & 3));
63 break;
64 case 2:
65 outw((u16)value, 0xCFC + (reg & 2));
66 break;
67 case 4:
68 outl((u32)value, 0xCFC);
69 break;
70 }
71
72 spin_unlock_irqrestore(&pci_config_lock, flags);
73
74 return 0;
75}
76
77#undef PCI_CONF1_ADDRESS
78
79struct pci_raw_ops pci_direct_conf1 = {
80 .read = pci_conf1_read,
81 .write = pci_conf1_write,
82};
83
84
85/*
86 * Functions for accessing PCI configuration space with type 2 accesses
87 */
88
89#define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg)
90
91static int pci_conf2_read(unsigned int seg, unsigned int bus,
92 unsigned int devfn, int reg, int len, u32 *value)
93{
94 unsigned long flags;
95 int dev, fn;
96
97 if ((bus > 255) || (devfn > 255) || (reg > 255)) {
98 *value = -1;
99 return -EINVAL;
100 }
101
102 dev = PCI_SLOT(devfn);
103 fn = PCI_FUNC(devfn);
104
105 if (dev & 0x10)
106 return PCIBIOS_DEVICE_NOT_FOUND;
107
108 spin_lock_irqsave(&pci_config_lock, flags);
109
110 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
111 outb((u8)bus, 0xCFA);
112
113 switch (len) {
114 case 1:
115 *value = inb(PCI_CONF2_ADDRESS(dev, reg));
116 break;
117 case 2:
118 *value = inw(PCI_CONF2_ADDRESS(dev, reg));
119 break;
120 case 4:
121 *value = inl(PCI_CONF2_ADDRESS(dev, reg));
122 break;
123 }
124
125 outb(0, 0xCF8);
126
127 spin_unlock_irqrestore(&pci_config_lock, flags);
128
129 return 0;
130}
131
132static int pci_conf2_write(unsigned int seg, unsigned int bus,
133 unsigned int devfn, int reg, int len, u32 value)
134{
135 unsigned long flags;
136 int dev, fn;
137
138 if ((bus > 255) || (devfn > 255) || (reg > 255))
139 return -EINVAL;
140
141 dev = PCI_SLOT(devfn);
142 fn = PCI_FUNC(devfn);
143
144 if (dev & 0x10)
145 return PCIBIOS_DEVICE_NOT_FOUND;
146
147 spin_lock_irqsave(&pci_config_lock, flags);
148
149 outb((u8)(0xF0 | (fn << 1)), 0xCF8);
150 outb((u8)bus, 0xCFA);
151
152 switch (len) {
153 case 1:
154 outb((u8)value, PCI_CONF2_ADDRESS(dev, reg));
155 break;
156 case 2:
157 outw((u16)value, PCI_CONF2_ADDRESS(dev, reg));
158 break;
159 case 4:
160 outl((u32)value, PCI_CONF2_ADDRESS(dev, reg));
161 break;
162 }
163
164 outb(0, 0xCF8);
165
166 spin_unlock_irqrestore(&pci_config_lock, flags);
167
168 return 0;
169}
170
171#undef PCI_CONF2_ADDRESS
172
173static struct pci_raw_ops pci_direct_conf2 = {
174 .read = pci_conf2_read,
175 .write = pci_conf2_write,
176};
177
178
179/*
180 * Before we decide to use direct hardware access mechanisms, we try to do some
181 * trivial checks to ensure it at least _seems_ to be working -- we just test
182 * whether bus 00 contains a host bridge (this is similar to checking
183 * techniques used in XFree86, but ours should be more reliable since we
184 * attempt to make use of direct access hints provided by the PCI BIOS).
185 *
186 * This should be close to trivial, but it isn't, because there are buggy
187 * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID.
188 */
189static int __init pci_sanity_check(struct pci_raw_ops *o)
190{
191 u32 x = 0;
192 int devfn;
193
194 if (pci_probe & PCI_NO_CHECKS)
195 return 1;
196 /* Assume Type 1 works for newer systems.
197 This handles machines that don't have anything on PCI Bus 0. */
198 if (dmi_get_year(DMI_BIOS_DATE) >= 2001)
199 return 1;
200
201 for (devfn = 0; devfn < 0x100; devfn++) {
202 if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x))
203 continue;
204 if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA)
205 return 1;
206
207 if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x))
208 continue;
209 if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ)
210 return 1;
211 }
212
213 DBG(KERN_WARNING "PCI: Sanity check failed\n");
214 return 0;
215}
216
217static int __init pci_check_type1(void)
218{
219 unsigned long flags;
220 unsigned int tmp;
221 int works = 0;
222
223 local_irq_save(flags);
224
225 outb(0x01, 0xCFB);
226 tmp = inl(0xCF8);
227 outl(0x80000000, 0xCF8);
228 if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
229 works = 1;
230 }
231 outl(tmp, 0xCF8);
232 local_irq_restore(flags);
233
234 return works;
235}
236
237static int __init pci_check_type2(void)
238{
239 unsigned long flags;
240 int works = 0;
241
242 local_irq_save(flags);
243
244 outb(0x00, 0xCFB);
245 outb(0x00, 0xCF8);
246 outb(0x00, 0xCFA);
247 if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
248 pci_sanity_check(&pci_direct_conf2)) {
249 works = 1;
250 }
251
252 local_irq_restore(flags);
253
254 return works;
255}
256
257void __init pci_direct_init(int type)
258{
259 if (type == 0)
260 return;
261 printk(KERN_INFO "PCI: Using configuration type %d\n", type);
262 if (type == 1)
263 raw_pci_ops = &pci_direct_conf1;
264 else
265 raw_pci_ops = &pci_direct_conf2;
266}
267
268int __init pci_direct_probe(void)
269{
270 struct resource *region, *region2;
271
272 if ((pci_probe & PCI_PROBE_CONF1) == 0)
273 goto type2;
274 region = request_region(0xCF8, 8, "PCI conf1");
275 if (!region)
276 goto type2;
277
278 if (pci_check_type1())
279 return 1;
280 release_resource(region);
281
282 type2:
283 if ((pci_probe & PCI_PROBE_CONF2) == 0)
284 return 0;
285 region = request_region(0xCF8, 4, "PCI conf2");
286 if (!region)
287 return 0;
288 region2 = request_region(0xC000, 0x1000, "PCI conf2");
289 if (!region2)
290 goto fail2;
291
292 if (pci_check_type2()) {
293 printk(KERN_INFO "PCI: Using configuration type 2\n");
294 raw_pci_ops = &pci_direct_conf2;
295 return 2;
296 }
297
298 release_resource(region2);
299 fail2:
300 release_resource(region);
301 return 0;
302}
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
new file mode 100644
index 000000000000..42df4b6606df
--- /dev/null
+++ b/arch/x86/pci/early.c
@@ -0,0 +1,59 @@
1#include <linux/kernel.h>
2#include <linux/pci.h>
3#include <asm/pci-direct.h>
4#include <asm/io.h>
5#include "pci.h"
6
7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */
9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{
14 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc);
17 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v;
20}
21
22u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
23{
24 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v;
29}
30
31u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
32{
33 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v;
38}
39
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val)
42{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc);
46}
47
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc);
53}
54
55int early_pci_allowed(void)
56{
57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
58 PCI_PROBE_CONF1;
59}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
new file mode 100644
index 000000000000..c82cbf4c7226
--- /dev/null
+++ b/arch/x86/pci/fixup.c
@@ -0,0 +1,446 @@
1/*
2 * Exceptions for specific devices. Usually work-arounds for fatal design flaws.
3 */
4
5#include <linux/delay.h>
6#include <linux/dmi.h>
7#include <linux/pci.h>
8#include <linux/init.h>
9#include "pci.h"
10
11
12static void __devinit pci_fixup_i450nx(struct pci_dev *d)
13{
14 /*
15 * i450NX -- Find and scan all secondary buses on all PXB's.
16 */
17 int pxb, reg;
18 u8 busno, suba, subb;
19
20 printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
21 reg = 0xd0;
22 for(pxb=0; pxb<2; pxb++) {
23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
27 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb)
30 pci_scan_bus_with_sysdata(suba+1); /* Bus B */
31 }
32 pcibios_last_bus = -1;
33}
34DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
35
36static void __devinit pci_fixup_i450gx(struct pci_dev *d)
37{
38 /*
39 * i450GX and i450KX -- Find and scan all secondary buses.
40 * (called separately for each PCI bridge found)
41 */
42 u8 busno;
43 pci_read_config_byte(d, 0x4a, &busno);
44 printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno);
45 pci_scan_bus_with_sysdata(busno);
46 pcibios_last_bus = -1;
47}
48DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
49
50static void __devinit pci_fixup_umc_ide(struct pci_dev *d)
51{
52 /*
53 * UM8886BF IDE controller sets region type bits incorrectly,
54	 * so they look like memory despite being I/O.
55 */
56 int i;
57
58 printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d));
59 for(i=0; i<4; i++)
60 d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
61}
62DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
63
64static void __devinit pci_fixup_ncr53c810(struct pci_dev *d)
65{
66 /*
67 * NCR 53C810 returns class code 0 (at least on some systems).
68 * Fix class to be PCI_CLASS_STORAGE_SCSI
69 */
70 if (!d->class) {
71 printk(KERN_WARNING "PCI: fixing NCR 53C810 class code for %s\n", pci_name(d));
72 d->class = PCI_CLASS_STORAGE_SCSI << 8;
73 }
74}
75DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810);
76
77static void __devinit pci_fixup_latency(struct pci_dev *d)
78{
79 /*
80 * SiS 5597 and 5598 chipsets require latency timer set to
81 * at most 32 to avoid lockups.
82 */
83 DBG("PCI: Setting max latency to 32\n");
84 pcibios_max_latency = 32;
85}
86DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency);
87DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5598, pci_fixup_latency);
88
89static void __devinit pci_fixup_piix4_acpi(struct pci_dev *d)
90{
91 /*
92 * PIIX4 ACPI device: hardwired IRQ9
93 */
94 d->irq = 9;
95}
96DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci_fixup_piix4_acpi);
97
98/*
99 * Addresses problems with the memory write queue timer in certain VIA
100 * Northbridges. This bugfix is per VIA's specifications, except for the
101 * KL133/KM133: clearing bit 5 on those Northbridges seems to trigger a
102 * bug in their integrated ProSavage video card, which causes screen
103 * corruption. We only clear bits 6 and 7 for that chipset, until VIA
104 * can provide us with definitive information on why the screen
105 * corruption occurs and what exactly those bits do.
106 *
107 * VIA 8363,8622,8361 Northbridges:
108 * - bits 5, 6, 7 at offset 0x55 need to be turned off
109 * VIA 8367 (KT266x) Northbridges:
110 * - bits 5, 6, 7 at offset 0x95 need to be turned off
111 * VIA 8363 rev 0x81/0x84 (KL133/KM133) Northbridges:
112 * - bits 6, 7 at offset 0x55 need to be turned off
113 */
114
115#define VIA_8363_KL133_REVISION_ID 0x81
116#define VIA_8363_KM133_REVISION_ID 0x84
117
118static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
119{
120 u8 v;
121 int where = 0x55;
122 int mask = 0x1f; /* clear bits 5, 6, 7 by default */
123
124 if (d->device == PCI_DEVICE_ID_VIA_8367_0) {
125		/* fix PCI bus latency issues caused by an NB BIOS error:
126		   it appears the bug free^Wreduced KT266x BIOS forces the
127		   NB latency to zero */
128 pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
129
130 where = 0x95; /* the memory write queue timer register is
131 different for the KT266x's: 0x95 not 0x55 */
132 } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
133 (d->revision == VIA_8363_KL133_REVISION_ID ||
134 d->revision == VIA_8363_KM133_REVISION_ID)) {
135 mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5
136 causes screen corruption on the KL133/KM133 */
137 }
138
139 pci_read_config_byte(d, where, &v);
140 if (v & ~mask) {
141 printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
142 d->device, d->revision, where, v, mask, v & mask);
143 v &= mask;
144 pci_write_config_byte(d, where, v);
145 }
146}
147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
148DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
149DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
150DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
151DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
152DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
153DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
154DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
155
156/*
157 * For some reason, Intel decided that certain parts of their
158 * 815, 845 and some other chipsets must look like PCI-to-PCI bridges
159 * while they are obviously not. The 82801 family (AA, AB, BAM/CAM,
160 * BA/CA/DB and E) PCI bridges are actually HUB-to-PCI ones, according
161 * to Intel terminology. These devices do forward all addresses from
162 * the system to the PCI bus no matter what their window settings are, so they
163 * are "transparent" (or subtractive decoding) from the programmer's point of view.
164 */
165static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev)
166{
167 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
168 (dev->device & 0xff00) == 0x2400)
169 dev->transparent = 1;
170}
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge);
172
173/*
174 * Fixup for C1 Halt Disconnect problem on nForce2 systems.
175 *
176 * From information provided by "Allen Martin" <AMartin@nvidia.com>:
177 *
178 * A hang is caused when the CPU generates a very fast CONNECT/HALT cycle
179 * sequence. Workaround is to set the SYSTEM_IDLE_TIMEOUT to 80 ns.
180 * This allows the state-machine and timer to return to a proper state within
181 * 80 ns of the CONNECT and probe appearing together. Since the CPU will not
182 * issue another HALT within 80 ns of the initial HALT, the failure condition
183 * is avoided.
184 */
185static void pci_fixup_nforce2(struct pci_dev *dev)
186{
187 u32 val;
188
189 /*
190 * Chip Old value New value
191 * C17 0x1F0FFF01 0x1F01FF01
192 * C18D 0x9F0FFF01 0x9F01FF01
193 *
194 * Northbridge chip version may be determined by
195 * reading the PCI revision ID (0xC1 or greater is C18D).
196 */
197 pci_read_config_dword(dev, 0x6c, &val);
198
199 /*
200 * Apply fixup if needed, but don't touch disconnect state
201 */
202 if ((val & 0x00FF0000) != 0x00010000) {
203 printk(KERN_WARNING "PCI: nForce2 C1 Halt Disconnect fixup\n");
204 pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000);
205 }
206}
207DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
208DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
209
210/* Max PCI Express root ports */
211#define MAX_PCIEROOT 6
212static int quirk_aspm_offset[MAX_PCIEROOT << 3];
213
214#define GET_INDEX(a, b) ((((a) - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + ((b) & 7))
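/* GET_INDEX gives each of the MAX_PCIEROOT root ports eight slots: e.g. for
 * the root port with device ID PCI_DEVICE_ID_INTEL_MCH_PA + 1 and a child at
 * function 2, the index is (1 << 3) + 2 == 10. */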
215
216static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
217{
218 return raw_pci_ops->read(0, bus->number, devfn, where, size, value);
219}
220
221/*
222 * Replace the original pci bus ops for write with a new one that will filter
223 * the request to ensure ASPM cannot be enabled.
224 */
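/* A sketch of the intent, assuming the standard PCI Express capability
 * layout: the saved offset points at the Link Control register, whose two
 * low bits form the ASPM control field, so masking the written value with
 * 0xfffffffc keeps L0s/L1 disabled. */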
225static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
226{
227 u8 offset;
228
229 offset = quirk_aspm_offset[GET_INDEX(bus->self->device, devfn)];
230
231 if ((offset) && (where == offset))
232 value = value & 0xfffffffc;
233
234 return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
235}
236
237static struct pci_ops quirk_pcie_aspm_ops = {
238 .read = quirk_pcie_aspm_read,
239 .write = quirk_pcie_aspm_write,
240};
241
242/*
243 * Prevents PCI Express ASPM (Active State Power Management) being enabled.
244 *
245 * Save the register offset, where the ASPM control bits are located,
246 * for each PCI Express device that is in the device list of
247 * the root port in an array for fast indexing. Replace the bus ops
248 * with the modified one.
249 */
250static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
251{
252 int cap_base, i;
253 struct pci_bus *pbus;
254 struct pci_dev *dev;
255
256 if ((pbus = pdev->subordinate) == NULL)
257 return;
258
259 /*
260 * Check if the DID of pdev matches one of the six root ports. This
261 * check is needed in the case this function is called directly by the
262 * hot-plug driver.
263 */
264 if ((pdev->device < PCI_DEVICE_ID_INTEL_MCH_PA) ||
265 (pdev->device > PCI_DEVICE_ID_INTEL_MCH_PC1))
266 return;
267
268 if (list_empty(&pbus->devices)) {
269 /*
270 * If no device is attached to the root port at power-up or
271 * after hot-remove, the pbus->devices is empty and this code
272 * will set the offsets to zero and the bus ops to parent's bus
273 * ops, which is unmodified.
274 */
275 for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
276 quirk_aspm_offset[i] = 0;
277
278 pbus->ops = pbus->parent->ops;
279 } else {
280 /*
281 * If devices are attached to the root port at power-up or
282 * after hot-add, the code loops through the device list of
283 * each root port to save the register offsets and replace the
284 * bus ops.
285 */
286 list_for_each_entry(dev, &pbus->devices, bus_list) {
287 /* There are 0 to 8 devices attached to this bus */
288 cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP);
289 quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10;
290 }
291 pbus->ops = &quirk_pcie_aspm_ops;
292 }
293}
294DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk );
295DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk );
296DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk );
297DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk );
298DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk );
299DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk );
300
301/*
302 * Fixup to mark boot BIOS video selected by BIOS before it changes
303 *
304 * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
305 *
306 * The standard boot ROM sequence for an x86 machine uses the BIOS
307 * to select an initial video card for boot display. This boot video
308 * card will have its BIOS copied to C0000 in system RAM.
309 * IORESOURCE_ROM_SHADOW is used to associate the boot video
310 * card with this copy. On laptops this copy has to be used since
311 * the main ROM may be compressed or combined with another image.
312 * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW
313 * is marked here since the boot video device will be the only enabled
314 * video device at this point.
315 */
316
317static void __devinit pci_fixup_video(struct pci_dev *pdev)
318{
319 struct pci_dev *bridge;
320 struct pci_bus *bus;
321 u16 config;
322
323 if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
324 return;
325
326 /* Is VGA routed to us? */
327 bus = pdev->bus;
328 while (bus) {
329 bridge = bus->self;
330
331 /*
332 * From information provided by
333 * "David Miller" <davem@davemloft.net>
334 * The bridge control register is valid for PCI header
335 * type BRIDGE, or CARDBUS. Host to PCI controllers use
336 * PCI header type NORMAL.
337 */
338 if (bridge
339 &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
340 ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
341 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
342 &config);
343 if (!(config & PCI_BRIDGE_CTL_VGA))
344 return;
345 }
346 bus = bus->parent;
347 }
348 pci_read_config_word(pdev, PCI_COMMAND, &config);
349 if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
350 pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
351 printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev));
352 }
353}
354DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
355
356/*
357 * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A.
358 *
359 * We pretend to bring them out of full D3 state, and restore the proper
360 * IRQ, PCI cache line size, and BARs, otherwise the device won't function
361 * properly. In some cases, the device will generate an interrupt on
362 * the wrong IRQ line, causing any devices sharing the line it's
363 * *supposed* to use to be disabled by the kernel's IRQ debug code.
364 */
365static u16 toshiba_line_size;
366
367static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
368 {
369 .ident = "Toshiba PS5 based laptop",
370 .matches = {
371 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
372 DMI_MATCH(DMI_PRODUCT_VERSION, "PS5"),
373 },
374 },
375 {
376 .ident = "Toshiba PSM4 based laptop",
377 .matches = {
378 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
379 DMI_MATCH(DMI_PRODUCT_VERSION, "PSM4"),
380 },
381 },
382 {
383 .ident = "Toshiba A40 based laptop",
384 .matches = {
385 DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
386 DMI_MATCH(DMI_PRODUCT_VERSION, "PSA40U"),
387 },
388 },
389 { }
390};
391
392static void __devinit pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev)
393{
394 if (!dmi_check_system(toshiba_ohci1394_dmi_table))
395 return; /* only applies to certain Toshibas (so far) */
396
397 dev->current_state = PCI_D3cold;
398 pci_read_config_word(dev, PCI_CACHE_LINE_SIZE, &toshiba_line_size);
399}
400DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TI, 0x8032,
401 pci_pre_fixup_toshiba_ohci1394);
402
403static void __devinit pci_post_fixup_toshiba_ohci1394(struct pci_dev *dev)
404{
405 if (!dmi_check_system(toshiba_ohci1394_dmi_table))
406 return; /* only applies to certain Toshibas (so far) */
407
408 /* Restore config space on Toshiba laptops */
409 pci_write_config_word(dev, PCI_CACHE_LINE_SIZE, toshiba_line_size);
410 pci_read_config_byte(dev, PCI_INTERRUPT_LINE, (u8 *)&dev->irq);
411 pci_write_config_dword(dev, PCI_BASE_ADDRESS_0,
412 pci_resource_start(dev, 0));
413 pci_write_config_dword(dev, PCI_BASE_ADDRESS_1,
414 pci_resource_start(dev, 1));
415}
416DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_TI, 0x8032,
417 pci_post_fixup_toshiba_ohci1394);
418
419
420/*
421 * Prevent the BIOS from trapping accesses to the Cyrix CS5530A video device
422 * configuration space.
423 */
424static void pci_early_fixup_cyrix_5530(struct pci_dev *dev)
425{
426 u8 r;
427 /* clear 'F4 Video Configuration Trap' bit */
428 pci_read_config_byte(dev, 0x42, &r);
429 r &= 0xfd;
430 pci_write_config_byte(dev, 0x42, r);
431}
432DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
433 pci_early_fixup_cyrix_5530);
434DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
435 pci_early_fixup_cyrix_5530);
436
437/*
438 * Siemens Nixdorf AG FSC Multiprocessor Interrupt Controller:
439 * prevent update of the BAR0, which doesn't look like a normal BAR.
440 */
441static void __devinit pci_siemens_interrupt_controller(struct pci_dev *dev)
442{
443 dev->resource[0].flags |= IORESOURCE_PCI_FIXED;
444}
445DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
446 pci_siemens_interrupt_controller);
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
new file mode 100644
index 000000000000..bcd2f94b732c
--- /dev/null
+++ b/arch/x86/pci/i386.c
@@ -0,0 +1,315 @@
1/*
2 * Low-Level PCI Access for i386 machines
3 *
4 * Copyright 1993, 1994 Drew Eckhardt
5 * Visionary Computing
6 * (Unix and Linux consulting and custom programming)
7 * Drew@Colorado.EDU
8 * +1 (303) 786-7975
9 *
10 * Drew's work was sponsored by:
11 * iX Multiuser Multitasking Magazine
12 * Hannover, Germany
13 * hm@ix.de
14 *
15 * Copyright 1997--2000 Martin Mares <mj@ucw.cz>
16 *
17 * For more information, please consult the following manuals (look at
18 * http://www.pcisig.com/ for how to get them):
19 *
20 * PCI BIOS Specification
21 * PCI Local Bus Specification
22 * PCI to PCI Bridge Specification
23 * PCI System Design Guide
24 *
25 */
26
27#include <linux/types.h>
28#include <linux/kernel.h>
29#include <linux/pci.h>
30#include <linux/init.h>
31#include <linux/ioport.h>
32#include <linux/errno.h>
33
34#include "pci.h"
35
36/*
37 * We need to avoid collisions with `mirrored' VGA ports
38 * and other strange ISA hardware, so we always want the
39 * addresses to be allocated in the 0x000-0x0ff region
40 * modulo 0x400.
41 *
42 * Why? Because some silly external IO cards only decode
43 * the low 10 bits of the IO address. The 0x00-0xff region
44 * is reserved for motherboard devices that decode all 16
45 * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
46 * but we want to try to avoid allocating at 0x2900-0x2bff
 47 * which might be mirrored at 0x0100-0x03ff.
48 */
49void
50pcibios_align_resource(void *data, struct resource *res,
51 resource_size_t size, resource_size_t align)
52{
53 if (res->flags & IORESOURCE_IO) {
54 resource_size_t start = res->start;
55
56 if (start & 0x300) {
57 start = (start + 0x3ff) & ~0x3ff;
58 res->start = start;
59 }
60 }
61}
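An illustrative sketch (editorial, not part of this patch): the same rounding as pcibios_align_resource() applied to hypothetical I/O starts, to make the modulo-0x400 rule above concrete. The helper name is made up for illustration.

/*
 * Sketch only: 0x2910 overlaps the mirrored 0x100-0x3ff band
 * (0x2910 & 0x300 != 0) and is rounded up to 0x2c00, while 0x2800
 * already sits in the safe 0x000-0x0ff band and is left alone.
 */
static inline resource_size_t io_align_sketch(resource_size_t start)
{
	if (start & 0x300)
		start = (start + 0x3ff) & ~0x3ff;
	return start;
}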
62
63
64/*
65 * Handle resources of PCI devices. If the world were perfect, we could
66 * just allocate all the resource regions and do nothing more. It isn't.
67 * On the other hand, we cannot just re-allocate all devices, as it would
68 * require us to know lots of host bridge internals. So we attempt to
69 * keep as much of the original configuration as possible, but tweak it
70 * when it's found to be wrong.
71 *
72 * Known BIOS problems we have to work around:
73 * - I/O or memory regions not configured
74 * - regions configured, but not enabled in the command register
75 * - bogus I/O addresses above 64K used
76 * - expansion ROMs left enabled (this may sound harmless, but given
77 * the fact the PCI specs explicitly allow address decoders to be
78 * shared between expansion ROMs and other resource regions, it's
79 * at least dangerous)
80 *
81 * Our solution:
82 * (1) Allocate resources for all buses behind PCI-to-PCI bridges.
83 * This gives us fixed barriers on where we can allocate.
84 * (2) Allocate resources for all enabled devices. If there is
85 * a collision, just mark the resource as unallocated. Also
86 * disable expansion ROMs during this step.
87 * (3) Try to allocate resources for disabled devices. If the
88 * resources were assigned correctly, everything goes well,
89 * if they weren't, they won't disturb allocation of other
90 * resources.
91 * (4) Assign new addresses to resources which were either
92 * not configured at all or misconfigured. If explicitly
93 * requested by the user, configure expansion ROM address
94 * as well.
95 */
96
97static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
98{
99 struct pci_bus *bus;
100 struct pci_dev *dev;
101 int idx;
102 struct resource *r, *pr;
103
104 /* Depth-First Search on bus tree */
105 list_for_each_entry(bus, bus_list, node) {
106 if ((dev = bus->self)) {
107 for (idx = PCI_BRIDGE_RESOURCES;
108 idx < PCI_NUM_RESOURCES; idx++) {
109 r = &dev->resource[idx];
110 if (!r->flags)
111 continue;
112 pr = pci_find_parent_resource(dev, r);
113 if (!r->start || !pr ||
114 request_resource(pr, r) < 0) {
115 printk(KERN_ERR "PCI: Cannot allocate "
116 "resource region %d "
117 "of bridge %s\n",
118 idx, pci_name(dev));
119 /*
120 * Something is wrong with the region.
121 * Invalidate the resource to prevent
122 * child resource allocations in this
123 * range.
124 */
125 r->flags = 0;
126 }
127 }
128 }
129 pcibios_allocate_bus_resources(&bus->children);
130 }
131}
132
133static void __init pcibios_allocate_resources(int pass)
134{
135 struct pci_dev *dev = NULL;
136 int idx, disabled;
137 u16 command;
138 struct resource *r, *pr;
139
140 for_each_pci_dev(dev) {
141 pci_read_config_word(dev, PCI_COMMAND, &command);
142 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) {
143 r = &dev->resource[idx];
144 if (r->parent) /* Already allocated */
145 continue;
146 if (!r->start) /* Address not assigned at all */
147 continue;
148 if (r->flags & IORESOURCE_IO)
149 disabled = !(command & PCI_COMMAND_IO);
150 else
151 disabled = !(command & PCI_COMMAND_MEMORY);
152 if (pass == disabled) {
153 DBG("PCI: Resource %08lx-%08lx "
154 "(f=%lx, d=%d, p=%d)\n",
155 r->start, r->end, r->flags, disabled, pass);
156 pr = pci_find_parent_resource(dev, r);
157 if (!pr || request_resource(pr, r) < 0) {
158 printk(KERN_ERR "PCI: Cannot allocate "
159 "resource region %d "
160 "of device %s\n",
161 idx, pci_name(dev));
162 /* We'll assign a new address later */
163 r->end -= r->start;
164 r->start = 0;
165 }
166 }
167 }
168 if (!pass) {
169 r = &dev->resource[PCI_ROM_RESOURCE];
170 if (r->flags & IORESOURCE_ROM_ENABLE) {
171 /* Turn the ROM off, leave the resource region,
172 * but keep it unregistered. */
173 u32 reg;
174 DBG("PCI: Switching off ROM of %s\n",
175 pci_name(dev));
176 r->flags &= ~IORESOURCE_ROM_ENABLE;
177 pci_read_config_dword(dev,
178 dev->rom_base_reg, &reg);
179 pci_write_config_dword(dev, dev->rom_base_reg,
180 reg & ~PCI_ROM_ADDRESS_ENABLE);
181 }
182 }
183 }
184}
185
186static int __init pcibios_assign_resources(void)
187{
188 struct pci_dev *dev = NULL;
189 struct resource *r, *pr;
190
191 if (!(pci_probe & PCI_ASSIGN_ROMS)) {
192 /*
193 * Try to use BIOS settings for ROMs, otherwise let
194 * pci_assign_unassigned_resources() allocate the new
195 * addresses.
196 */
197 for_each_pci_dev(dev) {
198 r = &dev->resource[PCI_ROM_RESOURCE];
199 if (!r->flags || !r->start)
200 continue;
201 pr = pci_find_parent_resource(dev, r);
202 if (!pr || request_resource(pr, r) < 0) {
203 r->end -= r->start;
204 r->start = 0;
205 }
206 }
207 }
208
209 pci_assign_unassigned_resources();
210
211 return 0;
212}
213
214void __init pcibios_resource_survey(void)
215{
216 DBG("PCI: Allocating resources\n");
217 pcibios_allocate_bus_resources(&pci_root_buses);
218 pcibios_allocate_resources(0);
219 pcibios_allocate_resources(1);
220}
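A clarifying aside (editorial, not part of the patch): the pass argument compared against the device's disabled state inside pcibios_allocate_resources() is what realizes steps (2) and (3) of the strategy comment above.

/*
 * pass == disabled, so the two calls above roughly mean:
 *   pcibios_allocate_resources(0): claim regions of devices whose
 *       PCI_COMMAND_IO/PCI_COMMAND_MEMORY bit is already set (step 2);
 *   pcibios_allocate_resources(1): then try regions of devices that
 *       are still disabled (step 3).
 */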
221
222/**
223 * called in fs_initcall (one below subsys_initcall),
224 * give a chance for motherboard reserve resources
225 */
226fs_initcall(pcibios_assign_resources);
227
228int pcibios_enable_resources(struct pci_dev *dev, int mask)
229{
230 u16 cmd, old_cmd;
231 int idx;
232 struct resource *r;
233
234 pci_read_config_word(dev, PCI_COMMAND, &cmd);
235 old_cmd = cmd;
236 for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
237 /* Only set up the requested stuff */
238 if (!(mask & (1 << idx)))
239 continue;
240
241 r = &dev->resource[idx];
242 if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
243 continue;
244 if ((idx == PCI_ROM_RESOURCE) &&
245 (!(r->flags & IORESOURCE_ROM_ENABLE)))
246 continue;
247 if (!r->start && r->end) {
248 printk(KERN_ERR "PCI: Device %s not available "
249 "because of resource %d collisions\n",
250 pci_name(dev), idx);
251 return -EINVAL;
252 }
253 if (r->flags & IORESOURCE_IO)
254 cmd |= PCI_COMMAND_IO;
255 if (r->flags & IORESOURCE_MEM)
256 cmd |= PCI_COMMAND_MEMORY;
257 }
258 if (cmd != old_cmd) {
259 printk("PCI: Enabling device %s (%04x -> %04x)\n",
260 pci_name(dev), old_cmd, cmd);
261 pci_write_config_word(dev, PCI_COMMAND, cmd);
262 }
263 return 0;
264}
265
266/*
267 * If we set up a device for bus mastering, we need to check the latency
268 * timer as certain crappy BIOSes forget to set it properly.
269 */
270unsigned int pcibios_max_latency = 255;
271
272void pcibios_set_master(struct pci_dev *dev)
273{
274 u8 lat;
275 pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
276 if (lat < 16)
277 lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
278 else if (lat > pcibios_max_latency)
279 lat = pcibios_max_latency;
280 else
281 return;
282 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n",
283 pci_name(dev), lat);
284 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
285}
286
287int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
288 enum pci_mmap_state mmap_state, int write_combine)
289{
290 unsigned long prot;
291
292 /* I/O space cannot be accessed via normal processor loads and
293 * stores on this platform.
294 */
295 if (mmap_state == pci_mmap_io)
296 return -EINVAL;
297
298 /* Leave vm_pgoff as-is, the PCI space address is the physical
299 * address on this platform.
300 */
301 prot = pgprot_val(vma->vm_page_prot);
302 if (boot_cpu_data.x86 > 3)
303 prot |= _PAGE_PCD | _PAGE_PWT;
304 vma->vm_page_prot = __pgprot(prot);
305
306 /* Write-combine setting is ignored, it is changed via the mtrr
307 * interfaces on this platform.
308 */
309 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
310 vma->vm_end - vma->vm_start,
311 vma->vm_page_prot))
312 return -EAGAIN;
313
314 return 0;
315}
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
new file mode 100644
index 000000000000..3de9f9ba2da6
--- /dev/null
+++ b/arch/x86/pci/init.c
@@ -0,0 +1,37 @@
1#include <linux/pci.h>
2#include <linux/init.h>
3#include "pci.h"
4
  5/* arch_initcall ordering is too unpredictable, so call the
  6   initializers in the right sequence from here. */
7static __init int pci_access_init(void)
8{
9 int type __maybe_unused = 0;
10
11#ifdef CONFIG_PCI_DIRECT
12 type = pci_direct_probe();
13#endif
14#ifdef CONFIG_PCI_MMCONFIG
15 pci_mmcfg_init(type);
16#endif
17 if (raw_pci_ops)
18 return 0;
19#ifdef CONFIG_PCI_BIOS
20 pci_pcbios_init();
21#endif
22 /*
 23	 * Don't check raw_pci_ops here: we want the PCI BIOS as the last
 24	 * fallback, but it has to run first so it can set pcibios_last_bus
 25	 * in case legacy PCI probing is used; otherwise detecting peer
 26	 * buses fails.
27 */
28#ifdef CONFIG_PCI_DIRECT
29 pci_direct_init(type);
30#endif
31 if (!raw_pci_ops)
32 printk(KERN_ERR
33 "PCI: Fatal: No config space access function found\n");
34
35 return 0;
36}
37arch_initcall(pci_access_init);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
new file mode 100644
index 000000000000..d98c6b096f8e
--- /dev/null
+++ b/arch/x86/pci/irq.c
@@ -0,0 +1,1173 @@
1/*
2 * Low-Level PCI Support for PC -- Routing of Interrupts
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/pci.h>
10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/interrupt.h>
13#include <linux/dmi.h>
14#include <asm/io.h>
15#include <asm/smp.h>
16#include <asm/io_apic.h>
17#include <linux/irq.h>
18#include <linux/acpi.h>
19
20#include "pci.h"
21
22#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
23#define PIRQ_VERSION 0x0100
24
25static int broken_hp_bios_irq9;
26static int acer_tm360_irqrouting;
27
28static struct irq_routing_table *pirq_table;
29
30static int pirq_enable_irq(struct pci_dev *dev);
31
32/*
33 * Never use: 0, 1, 2 (timer, keyboard, and cascade)
34 * Avoid using: 13, 14 and 15 (FP error and IDE).
35 * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
36 */
37unsigned int pcibios_irq_mask = 0xfff8;
38
39static int pirq_penalty[16] = {
40 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
41 0, 0, 0, 0, 1000, 100000, 100000, 100000
42};
43
44struct irq_router {
45 char *name;
46 u16 vendor, device;
47 int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
48 int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
49};
50
51struct irq_router_handler {
52 u16 vendor;
53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
54};
55
56int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
58
59/*
60 * Check passed address for the PCI IRQ Routing Table signature
61 * and perform checksum verification.
62 */
63
64static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
65{
66 struct irq_routing_table *rt;
67 int i;
68 u8 sum;
69
70 rt = (struct irq_routing_table *) addr;
71 if (rt->signature != PIRQ_SIGNATURE ||
72 rt->version != PIRQ_VERSION ||
73 rt->size % 16 ||
74 rt->size < sizeof(struct irq_routing_table))
75 return NULL;
76 sum = 0;
77 for (i=0; i < rt->size; i++)
78 sum += addr[i];
79 if (!sum) {
80 DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
81 return rt;
82 }
83 return NULL;
84}
85
86
87
88/*
89 * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
90 */
91
92static struct irq_routing_table * __init pirq_find_routing_table(void)
93{
94 u8 *addr;
95 struct irq_routing_table *rt;
96
97 if (pirq_table_addr) {
98 rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
99 if (rt)
100 return rt;
101 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
102 }
103 for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
104 rt = pirq_check_routing_table(addr);
105 if (rt)
106 return rt;
107 }
108 return NULL;
109}
110
111/*
 112 * If we have an IRQ routing table, use it to search for peer host
 113 * bridges. It's a gross hack, but since there are no other known
 114 * ways to get a list of buses, we have to go this way.
115 */
116
117static void __init pirq_peer_trick(void)
118{
119 struct irq_routing_table *rt = pirq_table;
120 u8 busmap[256];
121 int i;
122 struct irq_info *e;
123
124 memset(busmap, 0, sizeof(busmap));
125 for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
126 e = &rt->slots[i];
127#ifdef DEBUG
128 {
129 int j;
130 DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
131 for(j=0; j<4; j++)
132 DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
133 DBG("\n");
134 }
135#endif
136 busmap[e->bus] = 1;
137 }
138 for(i = 1; i < 256; i++) {
139 if (!busmap[i] || pci_find_bus(0, i))
140 continue;
141 if (pci_scan_bus_with_sysdata(i))
142 printk(KERN_INFO "PCI: Discovered primary peer "
143 "bus %02x [IRQ]\n", i);
144 }
145 pcibios_last_bus = -1;
146}
147
148/*
149 * Code for querying and setting of IRQ routes on various interrupt routers.
150 */
151
152void eisa_set_level_irq(unsigned int irq)
153{
154 unsigned char mask = 1 << (irq & 7);
155 unsigned int port = 0x4d0 + (irq >> 3);
156 unsigned char val;
157 static u16 eisa_irq_mask;
158
159 if (irq >= 16 || (1 << irq) & eisa_irq_mask)
160 return;
161
162 eisa_irq_mask |= (1 << irq);
163 printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
164 val = inb(port);
165 if (!(val & mask)) {
166 DBG(KERN_DEBUG " -> edge");
167 outb(val | mask, port);
168 }
169}
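A worked example (editorial sketch, assuming the standard ELCR layout): how an IRQ number maps onto the ports used above.

/*
 * IRQ 10: port = 0x4d0 + (10 >> 3) = 0x4d1, mask = 1 << (10 & 7) = 0x04.
 * Setting that bit marks IRQ 10 as level-triggered; a clear bit means
 * edge-triggered.
 */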
170
171/*
172 * Common IRQ routing practice: nybbles in config space,
173 * offset by some magic constant.
174 */
175static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
176{
177 u8 x;
178 unsigned reg = offset + (nr >> 1);
179
180 pci_read_config_byte(router, reg, &x);
181 return (nr & 1) ? (x >> 4) : (x & 0xf);
182}
183
184static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
185{
186 u8 x;
187 unsigned reg = offset + (nr >> 1);
188
189 pci_read_config_byte(router, reg, &x);
190 x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
191 pci_write_config_byte(router, reg, x);
192}
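A worked example (editorial, hypothetical values): how the nybble helpers above address a router register, using the 0x48 offset that the ALI code below passes in.

/*
 * For offset 0x48 and pirq 3 (nr = pirq - 1 = 2):
 *   reg = 0x48 + (2 >> 1) = 0x49, and since nr is even the value lives
 *   in the low nibble of register 0x49; nr = 3 would select the high
 *   nibble of the same byte.
 */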
193
194/*
195 * ALI pirq entries are damn ugly, and completely undocumented.
196 * This has been figured out from pirq tables, and it's not a pretty
197 * picture.
198 */
199static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
200{
201 static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
202
203 return irqmap[read_config_nybble(router, 0x48, pirq-1)];
204}
205
206static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
207{
208 static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
209 unsigned int val = irqmap[irq];
210
211 if (val) {
212 write_config_nybble(router, 0x48, pirq-1, val);
213 return 1;
214 }
215 return 0;
216}
217
218/*
219 * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
220 * just a pointer to the config space.
221 */
222static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
223{
224 u8 x;
225
226 pci_read_config_byte(router, pirq, &x);
227 return (x < 16) ? x : 0;
228}
229
230static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
231{
232 pci_write_config_byte(router, pirq, irq);
233 return 1;
234}
235
236/*
237 * The VIA pirq rules are nibble-based, like ALI,
238 * but without the ugly irq number munging.
239 * However, PIRQD is in the upper instead of lower 4 bits.
240 */
241static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
242{
243 return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
244}
245
246static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
247{
248 write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
249 return 1;
250}
251
252/*
253 * The VIA pirq rules are nibble-based, like ALI,
254 * but without the ugly irq number munging.
 255 * However, for the 82C586, the nibble map is different.
256 */
257static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
258{
259 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
260 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
261}
262
263static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
264{
265 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
266 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
267 return 1;
268}
269
270/*
271 * ITE 8330G pirq rules are nibble-based
272 * FIXME: pirqmap may be { 1, 0, 3, 2 },
273 * 2+3 are both mapped to irq 9 on my system
274 */
275static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
276{
277 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
278 return read_config_nybble(router,0x43, pirqmap[pirq-1]);
279}
280
281static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
282{
283 static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
284 write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
285 return 1;
286}
287
288/*
289 * OPTI: high four bits are nibble pointer..
290 * I wonder what the low bits do?
291 */
292static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
293{
294 return read_config_nybble(router, 0xb8, pirq >> 4);
295}
296
297static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
298{
299 write_config_nybble(router, 0xb8, pirq >> 4, irq);
300 return 1;
301}
302
303/*
304 * Cyrix: nibble offset 0x5C
305 * 0x5C bits 7:4 is INTB bits 3:0 is INTA
306 * 0x5D bits 7:4 is INTD bits 3:0 is INTC
307 */
308static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
309{
310 return read_config_nybble(router, 0x5C, (pirq-1)^1);
311}
312
313static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
314{
315 write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
316 return 1;
317}
318
319/*
320 * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
321 * We have to deal with the following issues here:
322 * - vendors have different ideas about the meaning of link values
323 * - some onboard devices (integrated in the chipset) have special
324 * links and are thus routed differently (i.e. not via PCI INTA-INTD)
325 * - different revision of the router have a different layout for
326 * the routing registers, particularly for the onchip devices
327 *
 328 * Common to all routing registers is one byte per routable link,
 329 * defined as:
330 * bit 7 IRQ mapping enabled (0) or disabled (1)
331 * bits [6:4] reserved (sometimes used for onchip devices)
332 * bits [3:0] IRQ to map to
333 * allowed: 3-7, 9-12, 14-15
334 * reserved: 0, 1, 2, 8, 13
335 *
336 * The config-space registers located at 0x41/0x42/0x43/0x44 are
337 * always used to route the normal PCI INT A/B/C/D respectively.
338 * Apparently there are systems implementing PCI routing table using
339 * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
340 * We try our best to handle both link mappings.
341 *
342 * Currently (2003-05-21) it appears most SiS chipsets follow the
343 * definition of routing registers from the SiS-5595 southbridge.
344 * According to the SiS 5595 datasheets the revision id's of the
345 * router (ISA-bridge) should be 0x01 or 0xb0.
346 *
347 * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
348 * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
349 * They seem to work with the current routing code. However there is
350 * some concern because of the two USB-OHCI HCs (original SiS 5595
351 * had only one). YMMV.
352 *
353 * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
354 *
355 * 0x61: IDEIRQ:
356 * bits [6:5] must be written 01
357 * bit 4 channel-select primary (0), secondary (1)
358 *
359 * 0x62: USBIRQ:
360 * bit 6 OHCI function disabled (0), enabled (1)
361 *
362 * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
363 *
364 * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
365 *
366 * We support USBIRQ (in addition to INTA-INTD) and keep the
367 * IDE, ACPI and DAQ routing untouched as set by the BIOS.
368 *
369 * Currently the only reported exception is the new SiS 65x chipset
370 * which includes the SiS 69x southbridge. Here we have the 85C503
371 * router revision 0x04 and there are changes in the register layout
372 * mostly related to the different USB HCs with USB 2.0 support.
373 *
 374 * Onchip routing for router rev-id 0x04 (trial-and-error observation)
375 *
376 * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
377 * bit 6-4 are probably unused, not like 5595
378 */
379
380#define PIRQ_SIS_IRQ_MASK 0x0f
381#define PIRQ_SIS_IRQ_DISABLE 0x80
382#define PIRQ_SIS_USB_ENABLE 0x40
383
384static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
385{
386 u8 x;
387 int reg;
388
389 reg = pirq;
390 if (reg >= 0x01 && reg <= 0x04)
391 reg += 0x40;
392 pci_read_config_byte(router, reg, &x);
393 return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
394}
395
396static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
397{
398 u8 x;
399 int reg;
400
401 reg = pirq;
402 if (reg >= 0x01 && reg <= 0x04)
403 reg += 0x40;
404 pci_read_config_byte(router, reg, &x);
405 x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
406 x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
407 pci_write_config_byte(router, reg, x);
408 return 1;
409}
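A worked example (editorial, hypothetical register contents) of the SiS layout documented above:

/*
 * 0x8a -> bit 7 set, routing disabled, pirq_sis_get() returns 0;
 * 0x0a -> routing enabled, low nibble 0xa, i.e. IRQ 10.
 * A link value of 0x02 is first remapped to config register 0x42.
 */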
410
411
412/*
413 * VLSI: nibble offset 0x74 - educated guess due to routing table and
414 * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
415 * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
416 * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
417 * for the busbridge to the docking station.
418 */
419
420static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
421{
422 if (pirq > 8) {
423 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
424 return 0;
425 }
426 return read_config_nybble(router, 0x74, pirq-1);
427}
428
429static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
430{
431 if (pirq > 8) {
432 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
433 return 0;
434 }
435 write_config_nybble(router, 0x74, pirq-1, irq);
436 return 1;
437}
438
439/*
440 * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
441 * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
442 * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
443 * register is a straight binary coding of desired PIC IRQ (low nibble).
444 *
445 * The 'link' value in the PIRQ table is already in the correct format
446 * for the Index register. There are some special index values:
447 * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
448 * and 0x03 for SMBus.
449 */
450static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
451{
452 outb_p(pirq, 0xc00);
453 return inb(0xc01) & 0xf;
454}
455
456static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
457{
458 outb_p(pirq, 0xc00);
459 outb_p(irq, 0xc01);
460 return 1;
461}
462
463/* Support for AMD756 PCI IRQ Routing
464 * Jhon H. Caicedo <jhcaiced@osso.org.co>
465 * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
466 * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
467 * The AMD756 pirq rules are nibble-based
468 * offset 0x56 0-3 PIRQA 4-7 PIRQB
469 * offset 0x57 0-3 PIRQC 4-7 PIRQD
470 */
471static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
472{
473 u8 irq;
474 irq = 0;
475 if (pirq <= 4)
476 {
477 irq = read_config_nybble(router, 0x56, pirq - 1);
478 }
479 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
480 dev->vendor, dev->device, pirq, irq);
481 return irq;
482}
483
484static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
485{
486 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
487 dev->vendor, dev->device, pirq, irq);
488 if (pirq <= 4)
489 {
490 write_config_nybble(router, 0x56, pirq - 1, irq);
491 }
492 return 1;
493}
494
495#ifdef CONFIG_PCI_BIOS
496
497static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
498{
499 struct pci_dev *bridge;
500 int pin = pci_get_interrupt_pin(dev, &bridge);
501 return pcibios_set_irq_routing(bridge, pin, irq);
502}
503
504#endif
505
506static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
507{
508 static struct pci_device_id __initdata pirq_440gx[] = {
509 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
510 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
511 { },
512 };
513
514 /* 440GX has a proprietary PIRQ router -- don't use it */
515 if (pci_dev_present(pirq_440gx))
516 return 0;
517
518 switch(device)
519 {
520 case PCI_DEVICE_ID_INTEL_82371FB_0:
521 case PCI_DEVICE_ID_INTEL_82371SB_0:
522 case PCI_DEVICE_ID_INTEL_82371AB_0:
523 case PCI_DEVICE_ID_INTEL_82371MX:
524 case PCI_DEVICE_ID_INTEL_82443MX_0:
525 case PCI_DEVICE_ID_INTEL_82801AA_0:
526 case PCI_DEVICE_ID_INTEL_82801AB_0:
527 case PCI_DEVICE_ID_INTEL_82801BA_0:
528 case PCI_DEVICE_ID_INTEL_82801BA_10:
529 case PCI_DEVICE_ID_INTEL_82801CA_0:
530 case PCI_DEVICE_ID_INTEL_82801CA_12:
531 case PCI_DEVICE_ID_INTEL_82801DB_0:
532 case PCI_DEVICE_ID_INTEL_82801E_0:
533 case PCI_DEVICE_ID_INTEL_82801EB_0:
534 case PCI_DEVICE_ID_INTEL_ESB_1:
535 case PCI_DEVICE_ID_INTEL_ICH6_0:
536 case PCI_DEVICE_ID_INTEL_ICH6_1:
537 case PCI_DEVICE_ID_INTEL_ICH7_0:
538 case PCI_DEVICE_ID_INTEL_ICH7_1:
539 case PCI_DEVICE_ID_INTEL_ICH7_30:
540 case PCI_DEVICE_ID_INTEL_ICH7_31:
541 case PCI_DEVICE_ID_INTEL_ESB2_0:
542 case PCI_DEVICE_ID_INTEL_ICH8_0:
543 case PCI_DEVICE_ID_INTEL_ICH8_1:
544 case PCI_DEVICE_ID_INTEL_ICH8_2:
545 case PCI_DEVICE_ID_INTEL_ICH8_3:
546 case PCI_DEVICE_ID_INTEL_ICH8_4:
547 case PCI_DEVICE_ID_INTEL_ICH9_0:
548 case PCI_DEVICE_ID_INTEL_ICH9_1:
549 case PCI_DEVICE_ID_INTEL_ICH9_2:
550 case PCI_DEVICE_ID_INTEL_ICH9_3:
551 case PCI_DEVICE_ID_INTEL_ICH9_4:
552 case PCI_DEVICE_ID_INTEL_ICH9_5:
553 case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
554 r->name = "PIIX/ICH";
555 r->get = pirq_piix_get;
556 r->set = pirq_piix_set;
557 return 1;
558 }
559 return 0;
560}
561
562static __init int via_router_probe(struct irq_router *r,
563 struct pci_dev *router, u16 device)
564{
565 /* FIXME: We should move some of the quirk fixup stuff here */
566
567 /*
568 * work arounds for some buggy BIOSes
569 */
570 if (device == PCI_DEVICE_ID_VIA_82C586_0) {
571 switch(router->device) {
572 case PCI_DEVICE_ID_VIA_82C686:
573 /*
574 * Asus k7m bios wrongly reports 82C686A
575 * as 586-compatible
576 */
577 device = PCI_DEVICE_ID_VIA_82C686;
578 break;
579 case PCI_DEVICE_ID_VIA_8235:
 580			/*
581 * Asus a7v-x bios wrongly reports 8235
582 * as 586-compatible
583 */
584 device = PCI_DEVICE_ID_VIA_8235;
585 break;
586 }
587 }
588
589 switch(device) {
590 case PCI_DEVICE_ID_VIA_82C586_0:
591 r->name = "VIA";
592 r->get = pirq_via586_get;
593 r->set = pirq_via586_set;
594 return 1;
595 case PCI_DEVICE_ID_VIA_82C596:
596 case PCI_DEVICE_ID_VIA_82C686:
597 case PCI_DEVICE_ID_VIA_8231:
598 case PCI_DEVICE_ID_VIA_8233A:
599 case PCI_DEVICE_ID_VIA_8235:
600 case PCI_DEVICE_ID_VIA_8237:
601 /* FIXME: add new ones for 8233/5 */
602 r->name = "VIA";
603 r->get = pirq_via_get;
604 r->set = pirq_via_set;
605 return 1;
606 }
607 return 0;
608}
609
610static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
611{
612 switch(device)
613 {
614 case PCI_DEVICE_ID_VLSI_82C534:
615 r->name = "VLSI 82C534";
616 r->get = pirq_vlsi_get;
617 r->set = pirq_vlsi_set;
618 return 1;
619 }
620 return 0;
621}
622
623
624static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
625{
626 switch(device)
627 {
628 case PCI_DEVICE_ID_SERVERWORKS_OSB4:
629 case PCI_DEVICE_ID_SERVERWORKS_CSB5:
630 r->name = "ServerWorks";
631 r->get = pirq_serverworks_get;
632 r->set = pirq_serverworks_set;
633 return 1;
634 }
635 return 0;
636}
637
638static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
639{
640 if (device != PCI_DEVICE_ID_SI_503)
641 return 0;
642
643 r->name = "SIS";
644 r->get = pirq_sis_get;
645 r->set = pirq_sis_set;
646 return 1;
647}
648
649static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
650{
651 switch(device)
652 {
653 case PCI_DEVICE_ID_CYRIX_5520:
654 r->name = "NatSemi";
655 r->get = pirq_cyrix_get;
656 r->set = pirq_cyrix_set;
657 return 1;
658 }
659 return 0;
660}
661
662static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
663{
664 switch(device)
665 {
666 case PCI_DEVICE_ID_OPTI_82C700:
667 r->name = "OPTI";
668 r->get = pirq_opti_get;
669 r->set = pirq_opti_set;
670 return 1;
671 }
672 return 0;
673}
674
675static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
676{
677 switch(device)
678 {
679 case PCI_DEVICE_ID_ITE_IT8330G_0:
680 r->name = "ITE";
681 r->get = pirq_ite_get;
682 r->set = pirq_ite_set;
683 return 1;
684 }
685 return 0;
686}
687
688static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
689{
690 switch(device)
691 {
692 case PCI_DEVICE_ID_AL_M1533:
693 case PCI_DEVICE_ID_AL_M1563:
694 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
695 r->name = "ALI";
696 r->get = pirq_ali_get;
697 r->set = pirq_ali_set;
698 return 1;
699 }
700 return 0;
701}
702
703static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
704{
705 switch(device)
706 {
707 case PCI_DEVICE_ID_AMD_VIPER_740B:
708 r->name = "AMD756";
709 break;
710 case PCI_DEVICE_ID_AMD_VIPER_7413:
711 r->name = "AMD766";
712 break;
713 case PCI_DEVICE_ID_AMD_VIPER_7443:
714 r->name = "AMD768";
715 break;
716 default:
717 return 0;
718 }
719 r->get = pirq_amd756_get;
720 r->set = pirq_amd756_set;
721 return 1;
722}
723
724static __initdata struct irq_router_handler pirq_routers[] = {
725 { PCI_VENDOR_ID_INTEL, intel_router_probe },
726 { PCI_VENDOR_ID_AL, ali_router_probe },
727 { PCI_VENDOR_ID_ITE, ite_router_probe },
728 { PCI_VENDOR_ID_VIA, via_router_probe },
729 { PCI_VENDOR_ID_OPTI, opti_router_probe },
730 { PCI_VENDOR_ID_SI, sis_router_probe },
731 { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
732 { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
733 { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
734 { PCI_VENDOR_ID_AMD, amd_router_probe },
735 /* Someone with docs needs to add the ATI Radeon IGP */
736 { 0, NULL }
737};
738static struct irq_router pirq_router;
739static struct pci_dev *pirq_router_dev;
740
741
742/*
743 * FIXME: should we have an option to say "generic for
744 * chipset" ?
745 */
746
747static void __init pirq_find_router(struct irq_router *r)
748{
749 struct irq_routing_table *rt = pirq_table;
750 struct irq_router_handler *h;
751
752#ifdef CONFIG_PCI_BIOS
753 if (!rt->signature) {
754 printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
755 r->set = pirq_bios_set;
756 r->name = "BIOS";
757 return;
758 }
759#endif
760
761 /* Default unless a driver reloads it */
762 r->name = "default";
763 r->get = NULL;
764 r->set = NULL;
765
766 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
767 rt->rtr_vendor, rt->rtr_device);
768
769 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
770 if (!pirq_router_dev) {
771 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
772 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
773 return;
774 }
775
776 for( h = pirq_routers; h->vendor; h++) {
777 /* First look for a router match */
778 if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
779 break;
780 /* Fall back to a device match */
781 if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
782 break;
783 }
784 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
785 pirq_router.name,
786 pirq_router_dev->vendor,
787 pirq_router_dev->device,
788 pci_name(pirq_router_dev));
789
790 /* The device remains referenced for the kernel lifetime */
791}
792
793static struct irq_info *pirq_get_info(struct pci_dev *dev)
794{
795 struct irq_routing_table *rt = pirq_table;
796 int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
797 struct irq_info *info;
798
799 for (info = rt->slots; entries--; info++)
800 if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
801 return info;
802 return NULL;
803}
804
805static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
806{
807 u8 pin;
808 struct irq_info *info;
809 int i, pirq, newirq;
810 int irq = 0;
811 u32 mask;
812 struct irq_router *r = &pirq_router;
813 struct pci_dev *dev2 = NULL;
814 char *msg = NULL;
815
816 /* Find IRQ pin */
817 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
818 if (!pin) {
819 DBG(KERN_DEBUG " -> no interrupt pin\n");
820 return 0;
821 }
822 pin = pin - 1;
823
824 /* Find IRQ routing entry */
825
826 if (!pirq_table)
827 return 0;
828
829 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
830 info = pirq_get_info(dev);
831 if (!info) {
832 DBG(" -> not found in routing table\n" KERN_DEBUG);
833 return 0;
834 }
835 pirq = info->irq[pin].link;
836 mask = info->irq[pin].bitmap;
837 if (!pirq) {
838 DBG(" -> not routed\n" KERN_DEBUG);
839 return 0;
840 }
841 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
842 mask &= pcibios_irq_mask;
843
844 /* Work around broken HP Pavilion Notebooks which assign USB to
845 IRQ 9 even though it is actually wired to IRQ 11 */
846
847 if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
848 dev->irq = 11;
849 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
850 r->set(pirq_router_dev, dev, pirq, 11);
851 }
852
853 /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
854 if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
855 pirq = 0x68;
856 mask = 0x400;
857 dev->irq = r->get(pirq_router_dev, dev, pirq);
858 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
859 }
860
861 /*
862 * Find the best IRQ to assign: use the one
863 * reported by the device if possible.
864 */
865 newirq = dev->irq;
866 if (newirq && !((1 << newirq) & mask)) {
867 if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
868 else printk("\n" KERN_WARNING
869 "PCI: IRQ %i for device %s doesn't match PIRQ mask "
870 "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
871 pci_name(dev));
872 }
873 if (!newirq && assign) {
874 for (i = 0; i < 16; i++) {
875 if (!(mask & (1 << i)))
876 continue;
877 if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED))
878 newirq = i;
879 }
880 }
881 DBG(" -> newirq=%d", newirq);
882
883 /* Check if it is hardcoded */
884 if ((pirq & 0xf0) == 0xf0) {
885 irq = pirq & 0xf;
886 DBG(" -> hardcoded IRQ %d\n", irq);
887 msg = "Hardcoded";
888 } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
889 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
890 DBG(" -> got IRQ %d\n", irq);
891 msg = "Found";
892 eisa_set_level_irq(irq);
893 } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
894 DBG(" -> assigning IRQ %d", newirq);
895 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
896 eisa_set_level_irq(newirq);
897 DBG(" ... OK\n");
898 msg = "Assigned";
899 irq = newirq;
900 }
901 }
902
903 if (!irq) {
904 DBG(" ... failed\n");
905 if (newirq && mask == (1 << newirq)) {
906 msg = "Guessed";
907 irq = newirq;
908 } else
909 return 0;
910 }
911 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
912
913 /* Update IRQ for all devices with the same pirq value */
914 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
915 pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
916 if (!pin)
917 continue;
918 pin--;
919 info = pirq_get_info(dev2);
920 if (!info)
921 continue;
922 if (info->irq[pin].link == pirq) {
923 /* We refuse to override the dev->irq information. Give a warning! */
924 if ( dev2->irq && dev2->irq != irq && \
925 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
926 ((1 << dev2->irq) & mask)) ) {
927#ifndef CONFIG_PCI_MSI
928 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
929 pci_name(dev2), dev2->irq, irq);
930#endif
931 continue;
932 }
933 dev2->irq = irq;
934 pirq_penalty[irq]++;
935 if (dev != dev2)
936 printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
937 }
938 }
939 return 1;
940}
941
942static void __init pcibios_fixup_irqs(void)
943{
944 struct pci_dev *dev = NULL;
945 u8 pin;
946
947 DBG(KERN_DEBUG "PCI: IRQ fixup\n");
948 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
949 /*
 950		 * If the BIOS has set an out-of-range IRQ number, just ignore it.
 951		 * Also keep track of which IRQs are already in use.
952 */
953 if (dev->irq >= 16) {
954 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
955 dev->irq = 0;
956 }
957 /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
958 if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
959 pirq_penalty[dev->irq] = 0;
960 pirq_penalty[dev->irq]++;
961 }
962
963 dev = NULL;
964 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
965 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
966#ifdef CONFIG_X86_IO_APIC
967 /*
968 * Recalculate IRQ numbers if we use the I/O APIC.
969 */
970 if (io_apic_assign_pci_irqs)
971 {
972 int irq;
973
974 if (pin) {
975 pin--; /* interrupt pins are numbered starting from 1 */
976 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
977 /*
978 * Busses behind bridges are typically not listed in the MP-table.
979 * In this case we have to look up the IRQ based on the parent bus,
980 * parent slot, and pin number. The SMP code detects such bridged
981 * busses itself so we should get into this branch reliably.
982 */
983 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
984 struct pci_dev * bridge = dev->bus->self;
985
986 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
987 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
988 PCI_SLOT(bridge->devfn), pin);
989 if (irq >= 0)
990 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
991 pci_name(bridge), 'A' + pin, irq);
992 }
993 if (irq >= 0) {
994 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
995 pci_name(dev), 'A' + pin, irq);
996 dev->irq = irq;
997 }
998 }
999 }
1000#endif
1001 /*
1002 * Still no IRQ? Try to lookup one...
1003 */
1004 if (pin && !dev->irq)
1005 pcibios_lookup_irq(dev, 0);
1006 }
1007}
1008
1009/*
1010 * Work around broken HP Pavilion Notebooks which assign USB to
1011 * IRQ 9 even though it is actually wired to IRQ 11
1012 */
1013static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
1014{
1015 if (!broken_hp_bios_irq9) {
1016 broken_hp_bios_irq9 = 1;
1017 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
1018 }
1019 return 0;
1020}
1021
1022/*
1023 * Work around broken Acer TravelMate 360 Notebooks which assign
1024 * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
1025 */
1026static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
1027{
1028 if (!acer_tm360_irqrouting) {
1029 acer_tm360_irqrouting = 1;
1030 printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
1031 }
1032 return 0;
1033}
1034
1035static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1036 {
1037 .callback = fix_broken_hp_bios_irq9,
1038 .ident = "HP Pavilion N5400 Series Laptop",
1039 .matches = {
1040 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1041 DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
1042 DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
1043 DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
1044 },
1045 },
1046 {
1047 .callback = fix_acer_tm360_irqrouting,
1048 .ident = "Acer TravelMate 36x Laptop",
1049 .matches = {
1050 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1051 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1052 },
1053 },
1054 { }
1055};
1056
1057static int __init pcibios_irq_init(void)
1058{
1059 DBG(KERN_DEBUG "PCI: IRQ init\n");
1060
1061 if (pcibios_enable_irq || raw_pci_ops == NULL)
1062 return 0;
1063
1064 dmi_check_system(pciirq_dmi_table);
1065
1066 pirq_table = pirq_find_routing_table();
1067
1068#ifdef CONFIG_PCI_BIOS
1069 if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
1070 pirq_table = pcibios_get_irq_routing_table();
1071#endif
1072 if (pirq_table) {
1073 pirq_peer_trick();
1074 pirq_find_router(&pirq_router);
1075 if (pirq_table->exclusive_irqs) {
1076 int i;
1077 for (i=0; i<16; i++)
1078 if (!(pirq_table->exclusive_irqs & (1 << i)))
1079 pirq_penalty[i] += 100;
1080 }
1081 /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
1082 if (io_apic_assign_pci_irqs)
1083 pirq_table = NULL;
1084 }
1085
1086 pcibios_enable_irq = pirq_enable_irq;
1087
1088 pcibios_fixup_irqs();
1089 return 0;
1090}
1091
1092subsys_initcall(pcibios_irq_init);
1093
1094
1095static void pirq_penalize_isa_irq(int irq, int active)
1096{
1097 /*
1098 * If any ISAPnP device reports an IRQ in its list of possible
 1099	 * IRQs, we try to avoid assigning it to PCI devices.
1100 */
1101 if (irq < 16) {
1102 if (active)
1103 pirq_penalty[irq] += 1000;
1104 else
1105 pirq_penalty[irq] += 100;
1106 }
1107}
1108
1109void pcibios_penalize_isa_irq(int irq, int active)
1110{
1111#ifdef CONFIG_ACPI
1112 if (!acpi_noirq)
1113 acpi_penalize_isa_irq(irq, active);
1114 else
1115#endif
1116 pirq_penalize_isa_irq(irq, active);
1117}
1118
1119static int pirq_enable_irq(struct pci_dev *dev)
1120{
1121 u8 pin;
1122 struct pci_dev *temp_dev;
1123
1124 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1125 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
1126 char *msg = "";
1127
1128 pin--; /* interrupt pins are numbered starting from 1 */
1129
1130 if (io_apic_assign_pci_irqs) {
1131 int irq;
1132
1133 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
1134 /*
1135 * Busses behind bridges are typically not listed in the MP-table.
1136 * In this case we have to look up the IRQ based on the parent bus,
1137 * parent slot, and pin number. The SMP code detects such bridged
1138 * busses itself so we should get into this branch reliably.
1139 */
1140 temp_dev = dev;
1141 while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
1142 struct pci_dev * bridge = dev->bus->self;
1143
1144 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1145 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1146 PCI_SLOT(bridge->devfn), pin);
1147 if (irq >= 0)
1148 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
1149 pci_name(bridge), 'A' + pin, irq);
1150 dev = bridge;
1151 }
1152 dev = temp_dev;
1153 if (irq >= 0) {
1154 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
1155 pci_name(dev), 'A' + pin, irq);
1156 dev->irq = irq;
1157 return 0;
1158 } else
1159 msg = " Probably buggy MP table.";
1160 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1161 msg = "";
1162 else
1163 msg = " Please try using pci=biosirq.";
1164
1165 /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
1166 if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
1167 return 0;
1168
1169 printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
1170 'A' + pin, pci_name(dev), msg);
1171 }
1172 return 0;
1173}
diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c
new file mode 100644
index 000000000000..9cc813e29706
--- /dev/null
+++ b/arch/x86/pci/k8-bus_64.c
@@ -0,0 +1,83 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3#include <asm/mpspec.h>
4#include <linux/cpumask.h>
5
6/*
7 * This discovers the pcibus <-> node mapping on AMD K8.
8 *
9 * RED-PEN need to call this again on PCI hotplug
10 * RED-PEN empty cpus get reported wrong
11 */
12
13#define NODE_ID_REGISTER 0x60
14#define NODE_ID(dword) (dword & 0x07)
15#define LDT_BUS_NUMBER_REGISTER_0 0x94
16#define LDT_BUS_NUMBER_REGISTER_1 0xB4
17#define LDT_BUS_NUMBER_REGISTER_2 0xD4
18#define NR_LDT_BUS_NUMBER_REGISTERS 3
19#define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)
20#define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF)
21#define PCI_DEVICE_ID_K8HTCONFIG 0x1100
22
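A worked example (editorial, hypothetical value) of decoding one LDT bus number register with the macros above:

/*
 * ldtbus == 0x00070400:
 *   SECONDARY_LDT_BUS_NUMBER(ldtbus)   == 0x04,
 *   SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0x07,
 * i.e. buses 4..7 hang off that HT link; an all-zero value means no
 * buses are behind the link and the entry is skipped below.
 */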
23/**
24 * fill_mp_bus_to_cpumask()
 25 * fills the mp_bus_to_cpumask array according to the LDT Bus Number
26 * Registers found in the K8 northbridge
27 */
28__init static int
29fill_mp_bus_to_cpumask(void)
30{
31 struct pci_dev *nb_dev = NULL;
32 int i, j;
33 u32 ldtbus, nid;
34 static int lbnr[3] = {
35 LDT_BUS_NUMBER_REGISTER_0,
36 LDT_BUS_NUMBER_REGISTER_1,
37 LDT_BUS_NUMBER_REGISTER_2
38 };
39
40 while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD,
41 PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) {
42 pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);
43
44 for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) {
45 pci_read_config_dword(nb_dev, lbnr[i], &ldtbus);
46 /*
47 * if there are no busses hanging off of the current
48 * ldt link then both the secondary and subordinate
49 * bus number fields are set to 0.
50 *
51 * RED-PEN
52 * This is slightly broken because it assumes
53 * HT node IDs == Linux node ids, which is not always
54 * true. However it is probably mostly true.
55 */
56 if (!(SECONDARY_LDT_BUS_NUMBER(ldtbus) == 0
57 && SUBORDINATE_LDT_BUS_NUMBER(ldtbus) == 0)) {
58 for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
59 j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
60 j++) {
61 struct pci_bus *bus;
62 struct pci_sysdata *sd;
63
64 long node = NODE_ID(nid);
 65				/* The algorithm is a bit dumb, but
 66				   it shouldn't matter here */
67 bus = pci_find_bus(0, j);
68 if (!bus)
69 continue;
70 if (!node_online(node))
71 node = 0;
72
73 sd = bus->sysdata;
74 sd->node = node;
75 }
76 }
77 }
78 }
79
80 return 0;
81}
82
83fs_initcall(fill_mp_bus_to_cpumask);
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
new file mode 100644
index 000000000000..5565d7016b75
--- /dev/null
+++ b/arch/x86/pci/legacy.c
@@ -0,0 +1,56 @@
1/*
2 * legacy.c - traditional, old school PCI bus probing
3 */
4#include <linux/init.h>
5#include <linux/pci.h>
6#include "pci.h"
7
8/*
9 * Discover remaining PCI buses in case there are peer host bridges.
 10 * We use the number of the last PCI bus provided by the PCI BIOS.
11 */
12static void __devinit pcibios_fixup_peer_bridges(void)
13{
14 int n, devfn;
15
16 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff)
17 return;
18 DBG("PCI: Peer bridge fixup\n");
19
20 for (n=0; n <= pcibios_last_bus; n++) {
21 u32 l;
22 if (pci_find_bus(0, n))
23 continue;
24 for (devfn = 0; devfn < 256; devfn += 8) {
25 if (!raw_pci_ops->read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
26 l != 0x0000 && l != 0xffff) {
27 DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
28 printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
29 pci_scan_bus_with_sysdata(n);
30 break;
31 }
32 }
33 }
34}
35
36static int __init pci_legacy_init(void)
37{
38 if (!raw_pci_ops) {
39 printk("PCI: System does not support PCI\n");
40 return 0;
41 }
42
43 if (pcibios_scanned++)
44 return 0;
45
46 printk("PCI: Probing PCI hardware\n");
47 pci_root_bus = pcibios_scan_root(0);
48 if (pci_root_bus)
49 pci_bus_add_devices(pci_root_bus);
50
51 pcibios_fixup_peer_bridges();
52
53 return 0;
54}
55
56subsys_initcall(pci_legacy_init);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
new file mode 100644
index 000000000000..4df637e34f81
--- /dev/null
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -0,0 +1,315 @@
1/*
2 * mmconfig-shared.c - Low-level direct PCI config space access via
3 * MMCONFIG - common code between i386 and x86-64.
4 *
5 * This code does:
6 * - known chipset handling
7 * - ACPI decoding and validation
8 *
9 * Per-architecture code takes care of the mappings and accesses
10 * themselves.
11 */
12
13#include <linux/pci.h>
14#include <linux/init.h>
15#include <linux/acpi.h>
16#include <linux/bitmap.h>
17#include <asm/e820.h>
18
19#include "pci.h"
20
21/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024)
23#define MMCONFIG_APER_MAX (256 * 1024*1024)
24
25DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
26
27/* Indicate if the mmcfg resources have been placed into the resource table. */
28static int __initdata pci_mmcfg_resources_inserted;
29
 30/* K8 systems have some devices (typically in the built-in northbridge)
 31   that are only accessible using type 1 config cycles.
 32   Normally this can be expressed in the MCFG by not listing them
 33   and assigning suitable _SEGs, but some BIOSes don't implement it.
 34   Instead, try to discover all devices on bus 0 that are unreachable
 35   using MMCONFIG and fall back to type 1 for them. */
36static void __init unreachable_devices(void)
37{
38 int i, bus;
39 /* Use the max bus number from ACPI here? */
40 for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) {
41 for (i = 0; i < 32; i++) {
42 unsigned int devfn = PCI_DEVFN(i, 0);
43 u32 val1, val2;
44
45 pci_conf1_read(0, bus, devfn, 0, 4, &val1);
46 if (val1 == 0xffffffff)
47 continue;
48
49 if (pci_mmcfg_arch_reachable(0, bus, devfn)) {
50 raw_pci_ops->read(0, bus, devfn, 0, 4, &val2);
51 if (val1 == val2)
52 continue;
53 }
54 set_bit(i + 32 * bus, pci_mmcfg_fallback_slots);
55 printk(KERN_NOTICE "PCI: No mmconfig possible on device"
56 " %02x:%02x\n", bus, i);
57 }
58 }
59}
60
61static const char __init *pci_mmcfg_e7520(void)
62{
63 u32 win;
64 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win);
65
66 win = win & 0xf000;
67 if(win == 0x0000 || win == 0xf000)
68 pci_mmcfg_config_num = 0;
69 else {
70 pci_mmcfg_config_num = 1;
71 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
72 if (!pci_mmcfg_config)
73 return NULL;
74 pci_mmcfg_config[0].address = win << 16;
75 pci_mmcfg_config[0].pci_segment = 0;
76 pci_mmcfg_config[0].start_bus_number = 0;
77 pci_mmcfg_config[0].end_bus_number = 255;
78 }
79
80 return "Intel Corporation E7520 Memory Controller Hub";
81}
82
83static const char __init *pci_mmcfg_intel_945(void)
84{
85 u32 pciexbar, mask = 0, len = 0;
86
87 pci_mmcfg_config_num = 1;
88
89 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar);
90
91 /* Enable bit */
92 if (!(pciexbar & 1))
93 pci_mmcfg_config_num = 0;
94
95 /* Size bits */
96 switch ((pciexbar >> 1) & 3) {
97 case 0:
98 mask = 0xf0000000U;
99 len = 0x10000000U;
100 break;
101 case 1:
102 mask = 0xf8000000U;
103 len = 0x08000000U;
104 break;
105 case 2:
106 mask = 0xfc000000U;
107 len = 0x04000000U;
108 break;
109 default:
110 pci_mmcfg_config_num = 0;
111 }
112
113 /* Errata #2, things break when not aligned on a 256Mb boundary */
114 /* Can only happen in 64M/128M mode */
115
116 if ((pciexbar & mask) & 0x0fffffffU)
117 pci_mmcfg_config_num = 0;
118
119 /* Don't hit the APIC registers and their friends */
120 if ((pciexbar & mask) >= 0xf0000000U)
121 pci_mmcfg_config_num = 0;
122
123 if (pci_mmcfg_config_num) {
124 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL);
125 if (!pci_mmcfg_config)
126 return NULL;
127 pci_mmcfg_config[0].address = pciexbar & mask;
128 pci_mmcfg_config[0].pci_segment = 0;
129 pci_mmcfg_config[0].start_bus_number = 0;
130 pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
131 }
132
133 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
134}
135
136struct pci_mmcfg_hostbridge_probe {
137 u32 vendor;
138 u32 device;
139 const char *(*probe)(void);
140};
141
142static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
143 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
144 { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
145};
146
147static int __init pci_mmcfg_check_hostbridge(void)
148{
149 u32 l;
150 u16 vendor, device;
151 int i;
152 const char *name;
153
154 pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l);
155 vendor = l & 0xffff;
156 device = (l >> 16) & 0xffff;
157
158 pci_mmcfg_config_num = 0;
159 pci_mmcfg_config = NULL;
160 name = NULL;
161
162 for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
163 if (pci_mmcfg_probes[i].vendor == vendor &&
164 pci_mmcfg_probes[i].device == device)
165 name = pci_mmcfg_probes[i].probe();
166 }
167
168 if (name) {
169 printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n",
170 name, pci_mmcfg_config_num ? "with" : "without");
171 }
172
173 return name != NULL;
174}
175
176static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
177{
178#define PCI_MMCFG_RESOURCE_NAME_LEN 19
179 int i;
180 struct resource *res;
181 char *names;
182 unsigned num_buses;
183
184 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
185 pci_mmcfg_config_num, GFP_KERNEL);
186 if (!res) {
187 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
188 return;
189 }
190
191 names = (void *)&res[pci_mmcfg_config_num];
192 for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
193 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
194 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
195 res->name = names;
196 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
197 cfg->pci_segment);
198 res->start = cfg->address;
199 res->end = res->start + (num_buses << 20) - 1;
200 res->flags = IORESOURCE_MEM | resource_flags;
201 insert_resource(&iomem_resource, res);
202 names += PCI_MMCFG_RESOURCE_NAME_LEN;
203 }
204
205 /* Mark that the resources have been inserted. */
206 pci_mmcfg_resources_inserted = 1;
207}
208
209static void __init pci_mmcfg_reject_broken(int type)
210{
211 typeof(pci_mmcfg_config[0]) *cfg;
212
213 if ((pci_mmcfg_config_num == 0) ||
214 (pci_mmcfg_config == NULL) ||
215 (pci_mmcfg_config[0].address == 0))
216 return;
217
218 cfg = &pci_mmcfg_config[0];
219
220 /*
221 * Handle more broken MCFG tables on Asus etc.
222 * They only contain a single entry for bus 0-0.
223 */
224 if (pci_mmcfg_config_num == 1 &&
225 cfg->pci_segment == 0 &&
226 (cfg->start_bus_number | cfg->end_bus_number) == 0) {
227 printk(KERN_ERR "PCI: start and end of bus number is 0. "
228 "Rejected as broken MCFG.\n");
229 goto reject;
230 }
231
232 /*
 233	 * Only do this check when type 1 works. If it doesn't work,
 234	 * assume we run on a Mac and always use MCFG.
235 */
236 if (type == 1 && !e820_all_mapped(cfg->address,
237 cfg->address + MMCONFIG_APER_MIN,
238 E820_RESERVED)) {
239 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
240 " E820-reserved\n", cfg->address);
241 goto reject;
242 }
243 return;
244
245reject:
246 printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
247 kfree(pci_mmcfg_config);
248 pci_mmcfg_config = NULL;
249 pci_mmcfg_config_num = 0;
250}
251
252void __init pci_mmcfg_init(int type)
253{
254 int known_bridge = 0;
255
256 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
257 return;
258
259 if (type == 1 && pci_mmcfg_check_hostbridge())
260 known_bridge = 1;
261
262 if (!known_bridge) {
263 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
264 pci_mmcfg_reject_broken(type);
265 }
266
267 if ((pci_mmcfg_config_num == 0) ||
268 (pci_mmcfg_config == NULL) ||
269 (pci_mmcfg_config[0].address == 0))
270 return;
271
272 if (pci_mmcfg_arch_init()) {
273 if (type == 1)
274 unreachable_devices();
275 if (known_bridge)
276 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
277 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
278 } else {
279 /*
280 * Signal not to attempt to insert mmcfg resources because
281 * the architecture mmcfg setup could not initialize.
282 */
283 pci_mmcfg_resources_inserted = 1;
284 }
285}
286
287static int __init pci_mmcfg_late_insert_resources(void)
288{
289 /*
290 * If resources are already inserted or we are not using MMCONFIG,
291 * don't insert the resources.
292 */
293 if ((pci_mmcfg_resources_inserted == 1) ||
294 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
295 (pci_mmcfg_config_num == 0) ||
296 (pci_mmcfg_config == NULL) ||
297 (pci_mmcfg_config[0].address == 0))
298 return 1;
299
300 /*
301 * Attempt to insert the mmcfg resources but not with the busy flag
302 * marked so it won't cause request errors when __request_region is
303 * called.
304 */
305 pci_mmcfg_insert_resources(0);
306
307 return 0;
308}
309
310/*
311 * Perform MMCONFIG resource insertion after PCI initialization to allow for
312 * misprogrammed MCFG tables that state larger sizes but actually conflict
313 * with other system resources.
314 */
315late_initcall(pci_mmcfg_late_insert_resources);
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
new file mode 100644
index 000000000000..1bf5816d34c8
--- /dev/null
+++ b/arch/x86/pci/mmconfig_32.c
@@ -0,0 +1,148 @@
1/*
2 * Copyright (C) 2004 Matthew Wilcox <matthew@wil.cx>
3 * Copyright (C) 2004 Intel Corp.
4 *
5 * This code is released under the GNU General Public License version 2.
6 */
7
8/*
9 * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
10 */
11
12#include <linux/pci.h>
13#include <linux/init.h>
14#include <linux/acpi.h>
15#include <asm/e820.h>
16#include "pci.h"
17
18/* Assume systems with more busses have correct MCFG */
19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
20
21/* The base address of the last MMCONFIG device accessed */
22static u32 mmcfg_last_accessed_device;
23static int mmcfg_last_accessed_cpu;
24
25/*
26 * Functions for accessing PCI configuration space with MMCONFIG accesses
27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{
30 struct acpi_mcfg_allocation *cfg;
31 int cfg_num;
32
33 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
34 test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots))
35 return 0;
36
37 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
38 cfg = &pci_mmcfg_config[cfg_num];
39 if (cfg->pci_segment == seg &&
40 (cfg->start_bus_number <= bus) &&
41 (cfg->end_bus_number >= bus))
42 return cfg->address;
43 }
44
45 /* Fall back to type 0 */
46 return 0;
47}
48
49/*
50 * This is always called under pci_config_lock
51 */
52static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
53{
54 u32 dev_base = base | (bus << 20) | (devfn << 12);
55 int cpu = smp_processor_id();
56 if (dev_base != mmcfg_last_accessed_device ||
57 cpu != mmcfg_last_accessed_cpu) {
58 mmcfg_last_accessed_device = dev_base;
59 mmcfg_last_accessed_cpu = cpu;
60 set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
61 }
62}
63
64static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
65 unsigned int devfn, int reg, int len, u32 *value)
66{
67 unsigned long flags;
68 u32 base;
69
70 if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
71 *value = -1;
72 return -EINVAL;
73 }
74
75 base = get_base_addr(seg, bus, devfn);
76 if (!base)
77 return pci_conf1_read(seg,bus,devfn,reg,len,value);
78
79 spin_lock_irqsave(&pci_config_lock, flags);
80
81 pci_exp_set_dev_base(base, bus, devfn);
82
83 switch (len) {
84 case 1:
85 *value = mmio_config_readb(mmcfg_virt_addr + reg);
86 break;
87 case 2:
88 *value = mmio_config_readw(mmcfg_virt_addr + reg);
89 break;
90 case 4:
91 *value = mmio_config_readl(mmcfg_virt_addr + reg);
92 break;
93 }
94 spin_unlock_irqrestore(&pci_config_lock, flags);
95
96 return 0;
97}
98
99static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
100 unsigned int devfn, int reg, int len, u32 value)
101{
102 unsigned long flags;
103 u32 base;
104
105 if ((bus > 255) || (devfn > 255) || (reg > 4095))
106 return -EINVAL;
107
108 base = get_base_addr(seg, bus, devfn);
109 if (!base)
110 return pci_conf1_write(seg,bus,devfn,reg,len,value);
111
112 spin_lock_irqsave(&pci_config_lock, flags);
113
114 pci_exp_set_dev_base(base, bus, devfn);
115
116 switch (len) {
117 case 1:
118 mmio_config_writeb(mmcfg_virt_addr + reg, value);
119 break;
120 case 2:
121 mmio_config_writew(mmcfg_virt_addr + reg, value);
122 break;
123 case 4:
124 mmio_config_writel(mmcfg_virt_addr + reg, value);
125 break;
126 }
127 spin_unlock_irqrestore(&pci_config_lock, flags);
128
129 return 0;
130}
131
132static struct pci_raw_ops pci_mmcfg = {
133 .read = pci_mmcfg_read,
134 .write = pci_mmcfg_write,
135};
136
137int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
138 unsigned int devfn)
139{
140 return get_base_addr(seg, bus, devfn) != 0;
141}
142
143int __init pci_mmcfg_arch_init(void)
144{
145 printk(KERN_INFO "PCI: Using MMCONFIG\n");
146 raw_pci_ops = &pci_mmcfg;
147 return 1;
148}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
new file mode 100644
index 000000000000..4095e4d66a1d
--- /dev/null
+++ b/arch/x86/pci/mmconfig_64.c
@@ -0,0 +1,157 @@
1/*
2 * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
3 *
4 * This is a 64-bit optimized version that always keeps the full mmconfig
5 * space mapped. This allows lockless config space operation.
6 */
7
8#include <linux/pci.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11#include <linux/bitmap.h>
12#include <asm/e820.h>
13
14#include "pci.h"
15
16/* Static virtual mapping of the MMCONFIG aperture */
17struct mmcfg_virt {
18 struct acpi_mcfg_allocation *cfg;
19 char __iomem *virt;
20};
21static struct mmcfg_virt *pci_mmcfg_virt;
22
23static char __iomem *get_virt(unsigned int seg, unsigned bus)
24{
25 struct acpi_mcfg_allocation *cfg;
26 int cfg_num;
27
28 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
29 cfg = pci_mmcfg_virt[cfg_num].cfg;
30 if (cfg->pci_segment == seg &&
31 (cfg->start_bus_number <= bus) &&
32 (cfg->end_bus_number >= bus))
33 return pci_mmcfg_virt[cfg_num].virt;
34 }
35
36 /* Fall back to type 0 */
37 return NULL;
38}
39
40static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
41{
42 char __iomem *addr;
43 if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS &&
44 test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots))
45 return NULL;
46 addr = get_virt(seg, bus);
47 if (!addr)
48 return NULL;
49 return addr + ((bus << 20) | (devfn << 12));
50}
51
52static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
53 unsigned int devfn, int reg, int len, u32 *value)
54{
55 char __iomem *addr;
56
57 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
58 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
59 *value = -1;
60 return -EINVAL;
61 }
62
63 addr = pci_dev_base(seg, bus, devfn);
64 if (!addr)
65 return pci_conf1_read(seg,bus,devfn,reg,len,value);
66
67 switch (len) {
68 case 1:
69 *value = mmio_config_readb(addr + reg);
70 break;
71 case 2:
72 *value = mmio_config_readw(addr + reg);
73 break;
74 case 4:
75 *value = mmio_config_readl(addr + reg);
76 break;
77 }
78
79 return 0;
80}
81
82static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
83 unsigned int devfn, int reg, int len, u32 value)
84{
85 char __iomem *addr;
86
87 /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
88 if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
89 return -EINVAL;
90
91 addr = pci_dev_base(seg, bus, devfn);
92 if (!addr)
93 return pci_conf1_write(seg,bus,devfn,reg,len,value);
94
95 switch (len) {
96 case 1:
97 mmio_config_writeb(addr + reg, value);
98 break;
99 case 2:
100 mmio_config_writew(addr + reg, value);
101 break;
102 case 4:
103 mmio_config_writel(addr + reg, value);
104 break;
105 }
106
107 return 0;
108}
109
110static struct pci_raw_ops pci_mmcfg = {
111 .read = pci_mmcfg_read,
112 .write = pci_mmcfg_write,
113};
114
115static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
116{
117 void __iomem *addr;
118 u32 size;
119
120 size = (cfg->end_bus_number + 1) << 20;
121 addr = ioremap_nocache(cfg->address, size);
122 if (addr) {
123 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
124 cfg->address, cfg->address + size - 1);
125 }
126 return addr;
127}
128
129int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
130 unsigned int devfn)
131{
132 return pci_dev_base(seg, bus, devfn) != NULL;
133}
134
135int __init pci_mmcfg_arch_init(void)
136{
137 int i;
138 pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) *
139 pci_mmcfg_config_num, GFP_KERNEL);
140 if (pci_mmcfg_virt == NULL) {
141 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
142 return 0;
143 }
144
145 for (i = 0; i < pci_mmcfg_config_num; ++i) {
146 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
147 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
148 if (!pci_mmcfg_virt[i].virt) {
149 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
150 "segment %d\n",
151 pci_mmcfg_config[i].pci_segment);
152 return 0;
153 }
154 }
155 raw_pci_ops = &pci_mmcfg;
156 return 1;
157}
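
Both the 32-bit and the 64-bit variant locate a function's registers at a fixed offset inside the MMCONFIG aperture: 1 MiB per bus and 4 KiB per device/function, which is exactly the shift-and-or arithmetic in pci_exp_set_dev_base() and pci_dev_base() above. A minimal, self-contained sketch of that offset calculation (illustrative only, not part of the patch; the helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* ECAM-style offset: bus in bits 27:20, devfn in bits 19:12, register in bits 11:0. */
static uint64_t ecam_offset(uint8_t bus, uint8_t devfn, uint16_t reg)
{
        return ((uint64_t)bus << 20) | ((uint64_t)devfn << 12) | (reg & 0xfff);
}

int main(void)
{
        /* Bus 3, device 2, function 1 (devfn 0x11), register 0x10: prints 0x311010. */
        printf("0x%llx\n", (unsigned long long)ecam_offset(3, (2 << 3) | 1, 0x10));
        return 0;
}
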
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
new file mode 100644
index 000000000000..f5f165f69e0c
--- /dev/null
+++ b/arch/x86/pci/numa.c
@@ -0,0 +1,135 @@
1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/nodemask.h>
8#include "pci.h"
9
10#define BUS2QUAD(global) (mp_bus_id_to_node[global])
11#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
12#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
13
14#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
15 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
16
17static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
18 unsigned int devfn, int reg, int len, u32 *value)
19{
20 unsigned long flags;
21
22 if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
23 return -EINVAL;
24
25 spin_lock_irqsave(&pci_config_lock, flags);
26
27 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
28
29 switch (len) {
30 case 1:
31 *value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus));
32 break;
33 case 2:
34 *value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus));
35 break;
36 case 4:
37 *value = inl_quad(0xCFC, BUS2QUAD(bus));
38 break;
39 }
40
41 spin_unlock_irqrestore(&pci_config_lock, flags);
42
43 return 0;
44}
45
46static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
47 unsigned int devfn, int reg, int len, u32 value)
48{
49 unsigned long flags;
50
51 if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
52 return -EINVAL;
53
54 spin_lock_irqsave(&pci_config_lock, flags);
55
56 outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
57
58 switch (len) {
59 case 1:
60 outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus));
61 break;
62 case 2:
63 outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus));
64 break;
65 case 4:
66 outl_quad((u32)value, 0xCFC, BUS2QUAD(bus));
67 break;
68 }
69
70 spin_unlock_irqrestore(&pci_config_lock, flags);
71
72 return 0;
73}
74
75#undef PCI_CONF1_MQ_ADDRESS
76
77static struct pci_raw_ops pci_direct_conf1_mq = {
78 .read = pci_conf1_mq_read,
79 .write = pci_conf1_mq_write
80};
81
82
83static void __devinit pci_fixup_i450nx(struct pci_dev *d)
84{
85 /*
86 * i450NX -- Find and scan all secondary buses on all PXB's.
87 */
88 int pxb, reg;
89 u8 busno, suba, subb;
90 int quad = BUS2QUAD(d->bus->number);
91
92 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
93 reg = 0xd0;
94 for(pxb=0; pxb<2; pxb++) {
95 pci_read_config_byte(d, reg++, &busno);
96 pci_read_config_byte(d, reg++, &suba);
97 pci_read_config_byte(d, reg++, &subb);
98 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
99 if (busno) {
100 /* Bus A */
101 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
102 }
103 if (suba < subb) {
104 /* Bus B */
105 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1));
106 }
107 }
108 pcibios_last_bus = -1;
109}
110DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
111
112static int __init pci_numa_init(void)
113{
114 int quad;
115
116 raw_pci_ops = &pci_direct_conf1_mq;
117
118 if (pcibios_scanned++)
119 return 0;
120
121 pci_root_bus = pcibios_scan_root(0);
122 if (pci_root_bus)
123 pci_bus_add_devices(pci_root_bus);
124 if (num_online_nodes() > 1)
125 for_each_online_node(quad) {
126 if (quad == 0)
127 continue;
128 printk("Scanning PCI bus %d for quad %d\n",
129 QUADLOCAL2BUS(quad,0), quad);
130 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0));
131 }
132 return 0;
133}
134
135subsys_initcall(pci_numa_init);
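
PCI_CONF1_MQ_ADDRESS above builds the standard type 1 configuration address that is written to port 0xCF8, with the quad-local bus number substituted via BUS2LOCAL(). A small sketch of the same encoding, using the bus number directly and made-up example values (illustration only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Type 1 config address: bit 31 enable, bits 23:16 bus, bits 15:8 devfn, bits 7:2 dword register. */
static uint32_t conf1_addr(uint8_t bus, uint8_t devfn, uint8_t reg)
{
        return 0x80000000u | ((uint32_t)bus << 16) | ((uint32_t)devfn << 8) | (reg & ~3u);
}

int main(void)
{
        /* Bus 0, device 2, function 0, register 0x04: prints 0x80001004. */
        printf("0x%08x\n", conf1_addr(0, (2 << 3) | 0, 0x04));
        return 0;
}
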
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
new file mode 100644
index 000000000000..10ac8c316c46
--- /dev/null
+++ b/arch/x86/pci/pcbios.c
@@ -0,0 +1,492 @@
1/*
2 * BIOS32 and PCI BIOS handling.
3 */
4
5#include <linux/pci.h>
6#include <linux/init.h>
7#include <linux/module.h>
8#include <linux/uaccess.h>
9#include "pci.h"
10#include "pci-functions.h"
11
12
13/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
15
16/* PCI signature: "PCI " */
17#define PCI_SIGNATURE (('P' << 0) + ('C' << 8) + ('I' << 16) + (' ' << 24))
18
19/* PCI service signature: "$PCI" */
20#define PCI_SERVICE (('$' << 0) + ('P' << 8) + ('C' << 16) + ('I' << 24))
21
22/* PCI BIOS hardware mechanism flags */
23#define PCIBIOS_HW_TYPE1 0x01
24#define PCIBIOS_HW_TYPE2 0x02
25#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20
27
28/*
29 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in
31 * Standard BIOS 32-bit Service Directory Proposal
32 * Revision 0.4 May 24, 1993
33 * Phoenix Technologies Ltd.
34 * Norwood, MA
35 * and the PCI BIOS specification.
36 */
37
38union bios32 {
39 struct {
40 unsigned long signature; /* _32_ */
41 unsigned long entry; /* 32 bit physical address */
42 unsigned char revision; /* Revision level, 0 */
43 unsigned char length; /* Length in paragraphs should be 01 */
44 unsigned char checksum; /* All bytes must add up to zero */
45 unsigned char reserved[5]; /* Must be zero */
46 } fields;
47 char chars[16];
48};
49
50/*
51 * Physical address of the service directory. I don't know if we're
52 * allowed to have more than one of these or not, so just in case
53 * we'll make pcibios_present() take a memory start parameter and store
54 * the array there.
55 */
56
57static struct {
58 unsigned long address;
59 unsigned short segment;
60} bios32_indirect = { 0, __KERNEL_CS };
61
62/*
63 * Returns the entry point for the given service, NULL on error
64 */
65
66static unsigned long bios32_service(unsigned long service)
67{
68 unsigned char return_code; /* %al */
69 unsigned long address; /* %ebx */
70 unsigned long length; /* %ecx */
71 unsigned long entry; /* %edx */
72 unsigned long flags;
73
74 local_irq_save(flags);
75 __asm__("lcall *(%%edi); cld"
76 : "=a" (return_code),
77 "=b" (address),
78 "=c" (length),
79 "=d" (entry)
80 : "0" (service),
81 "1" (0),
82 "D" (&bios32_indirect));
83 local_irq_restore(flags);
84
85 switch (return_code) {
86 case 0:
87 return address + entry;
88 case 0x80: /* Not present */
89 printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
90 return 0;
91 default: /* Shouldn't happen */
92 printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
93 service, return_code);
94 return 0;
95 }
96}
97
98static struct {
99 unsigned long address;
100 unsigned short segment;
101} pci_indirect = { 0, __KERNEL_CS };
102
103static int pci_bios_present;
104
105static int __devinit check_pcibios(void)
106{
107 u32 signature, eax, ebx, ecx;
108 u8 status, major_ver, minor_ver, hw_mech;
109 unsigned long flags, pcibios_entry;
110
111 if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
112 pci_indirect.address = pcibios_entry + PAGE_OFFSET;
113
114 local_irq_save(flags);
115 __asm__(
116 "lcall *(%%edi); cld\n\t"
117 "jc 1f\n\t"
118 "xor %%ah, %%ah\n"
119 "1:"
120 : "=d" (signature),
121 "=a" (eax),
122 "=b" (ebx),
123 "=c" (ecx)
124 : "1" (PCIBIOS_PCI_BIOS_PRESENT),
125 "D" (&pci_indirect)
126 : "memory");
127 local_irq_restore(flags);
128
129 status = (eax >> 8) & 0xff;
130 hw_mech = eax & 0xff;
131 major_ver = (ebx >> 8) & 0xff;
132 minor_ver = ebx & 0xff;
133 if (pcibios_last_bus < 0)
134 pcibios_last_bus = ecx & 0xff;
135 DBG("PCI: BIOS probe returned s=%02x hw=%02x ver=%02x.%02x l=%02x\n",
136 status, hw_mech, major_ver, minor_ver, pcibios_last_bus);
137 if (status || signature != PCI_SIGNATURE) {
138 printk (KERN_ERR "PCI: BIOS BUG #%x[%08x] found\n",
139 status, signature);
140 return 0;
141 }
142 printk(KERN_INFO "PCI: PCI BIOS revision %x.%02x entry at 0x%lx, last bus=%d\n",
143 major_ver, minor_ver, pcibios_entry, pcibios_last_bus);
144#ifdef CONFIG_PCI_DIRECT
145 if (!(hw_mech & PCIBIOS_HW_TYPE1))
146 pci_probe &= ~PCI_PROBE_CONF1;
147 if (!(hw_mech & PCIBIOS_HW_TYPE2))
148 pci_probe &= ~PCI_PROBE_CONF2;
149#endif
150 return 1;
151 }
152 return 0;
153}
154
155static int __devinit pci_bios_find_device (unsigned short vendor, unsigned short device_id,
156 unsigned short index, unsigned char *bus, unsigned char *device_fn)
157{
158 unsigned short bx;
159 unsigned short ret;
160
161 __asm__("lcall *(%%edi); cld\n\t"
162 "jc 1f\n\t"
163 "xor %%ah, %%ah\n"
164 "1:"
165 : "=b" (bx),
166 "=a" (ret)
167 : "1" (PCIBIOS_FIND_PCI_DEVICE),
168 "c" (device_id),
169 "d" (vendor),
170 "S" ((int) index),
171 "D" (&pci_indirect));
172 *bus = (bx >> 8) & 0xff;
173 *device_fn = bx & 0xff;
174 return (int) (ret & 0xff00) >> 8;
175}
176
177static int pci_bios_read(unsigned int seg, unsigned int bus,
178 unsigned int devfn, int reg, int len, u32 *value)
179{
180 unsigned long result = 0;
181 unsigned long flags;
182 unsigned long bx = (bus << 8) | devfn;
183
184 if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
185 return -EINVAL;
186
187 spin_lock_irqsave(&pci_config_lock, flags);
188
189 switch (len) {
190 case 1:
191 __asm__("lcall *(%%esi); cld\n\t"
192 "jc 1f\n\t"
193 "xor %%ah, %%ah\n"
194 "1:"
195 : "=c" (*value),
196 "=a" (result)
197 : "1" (PCIBIOS_READ_CONFIG_BYTE),
198 "b" (bx),
199 "D" ((long)reg),
200 "S" (&pci_indirect));
201 break;
202 case 2:
203 __asm__("lcall *(%%esi); cld\n\t"
204 "jc 1f\n\t"
205 "xor %%ah, %%ah\n"
206 "1:"
207 : "=c" (*value),
208 "=a" (result)
209 : "1" (PCIBIOS_READ_CONFIG_WORD),
210 "b" (bx),
211 "D" ((long)reg),
212 "S" (&pci_indirect));
213 break;
214 case 4:
215 __asm__("lcall *(%%esi); cld\n\t"
216 "jc 1f\n\t"
217 "xor %%ah, %%ah\n"
218 "1:"
219 : "=c" (*value),
220 "=a" (result)
221 : "1" (PCIBIOS_READ_CONFIG_DWORD),
222 "b" (bx),
223 "D" ((long)reg),
224 "S" (&pci_indirect));
225 break;
226 }
227
228 spin_unlock_irqrestore(&pci_config_lock, flags);
229
230 return (int)((result & 0xff00) >> 8);
231}
232
233static int pci_bios_write(unsigned int seg, unsigned int bus,
234 unsigned int devfn, int reg, int len, u32 value)
235{
236 unsigned long result = 0;
237 unsigned long flags;
238 unsigned long bx = (bus << 8) | devfn;
239
240 if ((bus > 255) || (devfn > 255) || (reg > 255))
241 return -EINVAL;
242
243 spin_lock_irqsave(&pci_config_lock, flags);
244
245 switch (len) {
246 case 1:
247 __asm__("lcall *(%%esi); cld\n\t"
248 "jc 1f\n\t"
249 "xor %%ah, %%ah\n"
250 "1:"
251 : "=a" (result)
252 : "0" (PCIBIOS_WRITE_CONFIG_BYTE),
253 "c" (value),
254 "b" (bx),
255 "D" ((long)reg),
256 "S" (&pci_indirect));
257 break;
258 case 2:
259 __asm__("lcall *(%%esi); cld\n\t"
260 "jc 1f\n\t"
261 "xor %%ah, %%ah\n"
262 "1:"
263 : "=a" (result)
264 : "0" (PCIBIOS_WRITE_CONFIG_WORD),
265 "c" (value),
266 "b" (bx),
267 "D" ((long)reg),
268 "S" (&pci_indirect));
269 break;
270 case 4:
271 __asm__("lcall *(%%esi); cld\n\t"
272 "jc 1f\n\t"
273 "xor %%ah, %%ah\n"
274 "1:"
275 : "=a" (result)
276 : "0" (PCIBIOS_WRITE_CONFIG_DWORD),
277 "c" (value),
278 "b" (bx),
279 "D" ((long)reg),
280 "S" (&pci_indirect));
281 break;
282 }
283
284 spin_unlock_irqrestore(&pci_config_lock, flags);
285
286 return (int)((result & 0xff00) >> 8);
287}
288
289
290/*
291 * Function table for BIOS32 access
292 */
293
294static struct pci_raw_ops pci_bios_access = {
295 .read = pci_bios_read,
296 .write = pci_bios_write
297};
298
299/*
300 * Try to find PCI BIOS.
301 */
302
303static struct pci_raw_ops * __devinit pci_find_bios(void)
304{
305 union bios32 *check;
306 unsigned char sum;
307 int i, length;
308
309 /*
310 * Follow the standard procedure for locating the BIOS32 Service
311 * directory by scanning the permissible address range from
312 * 0xe0000 through 0xfffff for a valid BIOS32 structure.
313 */
314
315 for (check = (union bios32 *) __va(0xe0000);
316 check <= (union bios32 *) __va(0xffff0);
317 ++check) {
318 long sig;
319 if (probe_kernel_address(&check->fields.signature, sig))
320 continue;
321
322 if (check->fields.signature != BIOS32_SIGNATURE)
323 continue;
324 length = check->fields.length * 16;
325 if (!length)
326 continue;
327 sum = 0;
328 for (i = 0; i < length ; ++i)
329 sum += check->chars[i];
330 if (sum != 0)
331 continue;
332 if (check->fields.revision != 0) {
333 printk("PCI: unsupported BIOS32 revision %d at 0x%p\n",
334 check->fields.revision, check);
335 continue;
336 }
337 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
338 if (check->fields.entry >= 0x100000) {
339 printk("PCI: BIOS32 entry (0x%p) in high memory, "
340 "cannot use.\n", check);
341 return NULL;
342 } else {
343 unsigned long bios32_entry = check->fields.entry;
344 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
345 bios32_entry);
346 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
347 if (check_pcibios())
348 return &pci_bios_access;
349 }
350 break; /* Hopefully more than one BIOS32 cannot happen... */
351 }
352
353 return NULL;
354}
355
356/*
357 * Sort the device list according to PCI BIOS. Nasty hack, but since some
358 * fool forgot to define the `correct' device order in the PCI BIOS specs
359 * and we want to be (possibly bug-to-bug ;-]) compatible with older kernels
360 * which used BIOS ordering, we are bound to do this...
361 */
362
363void __devinit pcibios_sort(void)
364{
365 LIST_HEAD(sorted_devices);
366 struct list_head *ln;
367 struct pci_dev *dev, *d;
368 int idx, found;
369 unsigned char bus, devfn;
370
371 DBG("PCI: Sorting device list...\n");
372 while (!list_empty(&pci_devices)) {
373 ln = pci_devices.next;
374 dev = pci_dev_g(ln);
375 idx = found = 0;
376 while (pci_bios_find_device(dev->vendor, dev->device, idx, &bus, &devfn) == PCIBIOS_SUCCESSFUL) {
377 idx++;
378 list_for_each(ln, &pci_devices) {
379 d = pci_dev_g(ln);
380 if (d->bus->number == bus && d->devfn == devfn) {
381 list_move_tail(&d->global_list, &sorted_devices);
382 if (d == dev)
383 found = 1;
384 break;
385 }
386 }
387 if (ln == &pci_devices) {
388 printk(KERN_WARNING "PCI: BIOS reporting unknown device %02x:%02x\n", bus, devfn);
389 /*
390 * We must not continue scanning as several buggy BIOSes
391 * return garbage after the last device. Grr.
392 */
393 break;
394 }
395 }
396 if (!found) {
397 printk(KERN_WARNING "PCI: Device %s not found by BIOS\n",
398 pci_name(dev));
399 list_move_tail(&dev->global_list, &sorted_devices);
400 }
401 }
402 list_splice(&sorted_devices, &pci_devices);
403}
404
405/*
406 * BIOS Functions for IRQ Routing
407 */
408
409struct irq_routing_options {
410 u16 size;
411 struct irq_info *table;
412 u16 segment;
413} __attribute__((packed));
414
415struct irq_routing_table * pcibios_get_irq_routing_table(void)
416{
417 struct irq_routing_options opt;
418 struct irq_routing_table *rt = NULL;
419 int ret, map;
420 unsigned long page;
421
422 if (!pci_bios_present)
423 return NULL;
424 page = __get_free_page(GFP_KERNEL);
425 if (!page)
426 return NULL;
427 opt.table = (struct irq_info *) page;
428 opt.size = PAGE_SIZE;
429 opt.segment = __KERNEL_DS;
430
431 DBG("PCI: Fetching IRQ routing table... ");
432 __asm__("push %%es\n\t"
433 "push %%ds\n\t"
434 "pop %%es\n\t"
435 "lcall *(%%esi); cld\n\t"
436 "pop %%es\n\t"
437 "jc 1f\n\t"
438 "xor %%ah, %%ah\n"
439 "1:"
440 : "=a" (ret),
441 "=b" (map),
442 "=m" (opt)
443 : "0" (PCIBIOS_GET_ROUTING_OPTIONS),
444 "1" (0),
445 "D" ((long) &opt),
446 "S" (&pci_indirect),
447 "m" (opt)
448 : "memory");
449 DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
450 if (ret & 0xff00)
451 printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
452 else if (opt.size) {
453 rt = kmalloc(sizeof(struct irq_routing_table) + opt.size, GFP_KERNEL);
454 if (rt) {
455 memset(rt, 0, sizeof(struct irq_routing_table));
456 rt->size = opt.size + sizeof(struct irq_routing_table);
457 rt->exclusive_irqs = map;
458 memcpy(rt->slots, (void *) page, opt.size);
459 printk(KERN_INFO "PCI: Using BIOS Interrupt Routing Table\n");
460 }
461 }
462 free_page(page);
463 return rt;
464}
465EXPORT_SYMBOL(pcibios_get_irq_routing_table);
466
467int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq)
468{
469 int ret;
470
471 __asm__("lcall *(%%esi); cld\n\t"
472 "jc 1f\n\t"
473 "xor %%ah, %%ah\n"
474 "1:"
475 : "=a" (ret)
476 : "0" (PCIBIOS_SET_PCI_HW_INT),
477 "b" ((dev->bus->number << 8) | dev->devfn),
478 "c" ((irq << 8) | (pin + 10)),
479 "S" (&pci_indirect));
480 return !(ret & 0xff00);
481}
482EXPORT_SYMBOL(pcibios_set_irq_routing);
483
484void __init pci_pcbios_init(void)
485{
486 if ((pci_probe & PCI_PROBE_BIOS)
487 && ((raw_pci_ops = pci_find_bios()))) {
488 pci_probe |= PCI_BIOS_SORT;
489 pci_bios_present = 1;
490 }
491}
492
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
new file mode 100644
index 000000000000..8c66f275756f
--- /dev/null
+++ b/arch/x86/pci/pci.h
@@ -0,0 +1,149 @@
1/*
2 * Low-Level PCI Access for i386 machines.
3 *
4 * (c) 1999 Martin Mares <mj@ucw.cz>
5 */
6
7#undef DEBUG
8
9#ifdef DEBUG
10#define DBG(x...) printk(x)
11#else
12#define DBG(x...)
13#endif
14
15#define PCI_PROBE_BIOS 0x0001
16#define PCI_PROBE_CONF1 0x0002
17#define PCI_PROBE_CONF2 0x0004
18#define PCI_PROBE_MMCONF 0x0008
19#define PCI_PROBE_MASK 0x000f
20#define PCI_PROBE_NOEARLY 0x0010
21
22#define PCI_NO_SORT 0x0100
23#define PCI_BIOS_SORT 0x0200
24#define PCI_NO_CHECKS 0x0400
25#define PCI_USE_PIRQ_MASK 0x0800
26#define PCI_ASSIGN_ROMS 0x1000
27#define PCI_BIOS_IRQ_SCAN 0x2000
28#define PCI_ASSIGN_ALL_BUSSES 0x4000
29
30extern unsigned int pci_probe;
31extern unsigned long pirq_table_addr;
32
33enum pci_bf_sort_state {
34 pci_bf_sort_default,
35 pci_force_nobf,
36 pci_force_bf,
37 pci_dmi_bf,
38};
39
40/* pci-i386.c */
41
42extern unsigned int pcibios_max_latency;
43
44void pcibios_resource_survey(void);
45int pcibios_enable_resources(struct pci_dev *, int);
46
47/* pci-pc.c */
48
49extern int pcibios_last_bus;
50extern struct pci_bus *pci_root_bus;
51extern struct pci_ops pci_root_ops;
52
53/* pci-irq.c */
54
55struct irq_info {
56 u8 bus, devfn; /* Bus, device and function */
57 struct {
58 u8 link; /* IRQ line ID, chipset dependent, 0=not routed */
59 u16 bitmap; /* Available IRQs */
60 } __attribute__((packed)) irq[4];
61 u8 slot; /* Slot number, 0=onboard */
62 u8 rfu;
63} __attribute__((packed));
64
65struct irq_routing_table {
66 u32 signature; /* PIRQ_SIGNATURE should be here */
67 u16 version; /* PIRQ_VERSION */
68 u16 size; /* Table size in bytes */
69 u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
70 u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */
71 u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */
72 u32 miniport_data; /* Crap */
73 u8 rfu[11];
74 u8 checksum; /* Modulo 256 checksum must give zero */
75 struct irq_info slots[0];
76} __attribute__((packed));
77
78extern unsigned int pcibios_irq_mask;
79
80extern int pcibios_scanned;
81extern spinlock_t pci_config_lock;
82
83extern int (*pcibios_enable_irq)(struct pci_dev *dev);
84extern void (*pcibios_disable_irq)(struct pci_dev *dev);
85
86extern int pci_conf1_write(unsigned int seg, unsigned int bus,
87 unsigned int devfn, int reg, int len, u32 value);
88extern int pci_conf1_read(unsigned int seg, unsigned int bus,
89 unsigned int devfn, int reg, int len, u32 *value);
90
91extern int pci_direct_probe(void);
92extern void pci_direct_init(int type);
93extern void pci_pcbios_init(void);
94extern void pci_mmcfg_init(int type);
95extern void pcibios_sort(void);
96
97/* pci-mmconfig.c */
98
99/* Verify the first 16 busses. We assume that systems with more busses
100 get MCFG right. */
101#define PCI_MMCFG_MAX_CHECK_BUS 16
102extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS);
103
104extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus,
105 unsigned int devfn);
106extern int __init pci_mmcfg_arch_init(void);
107
108/*
109 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
110 * on their northbridge except through the %eax register. As such, you MUST
111 * NOT use normal IOMEM accesses; you need to use only the magic mmio-config
112 * accessor functions.
113 * In fact just use pci_config_*, nothing else please.
114 */
115static inline unsigned char mmio_config_readb(void __iomem *pos)
116{
117 u8 val;
118 asm volatile("movb (%1),%%al" : "=a" (val) : "r" (pos));
119 return val;
120}
121
122static inline unsigned short mmio_config_readw(void __iomem *pos)
123{
124 u16 val;
125 asm volatile("movw (%1),%%ax" : "=a" (val) : "r" (pos));
126 return val;
127}
128
129static inline unsigned int mmio_config_readl(void __iomem *pos)
130{
131 u32 val;
132 asm volatile("movl (%1),%%eax" : "=a" (val) : "r" (pos));
133 return val;
134}
135
136static inline void mmio_config_writeb(void __iomem *pos, u8 val)
137{
138 asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
139}
140
141static inline void mmio_config_writew(void __iomem *pos, u16 val)
142{
143 asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
144}
145
146static inline void mmio_config_writel(void __iomem *pos, u32 val)
147{
148 asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
149}
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
new file mode 100644
index 000000000000..8ecb1c722594
--- /dev/null
+++ b/arch/x86/pci/visws.c
@@ -0,0 +1,111 @@
1/*
2 * Low-Level PCI Support for SGI Visual Workstation
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 */
6
7#include <linux/kernel.h>
8#include <linux/pci.h>
9#include <linux/init.h>
10
11#include "cobalt.h"
12#include "lithium.h"
13
14#include "pci.h"
15
16
17extern struct pci_raw_ops pci_direct_conf1;
18
19static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
20static void pci_visws_disable_irq(struct pci_dev *dev) { }
21
22int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq;
23void (*pcibios_disable_irq)(struct pci_dev *dev) = &pci_visws_disable_irq;
24
25void __init pcibios_penalize_isa_irq(int irq, int active) {}
26
27
28unsigned int pci_bus0, pci_bus1;
29
30static inline u8 bridge_swizzle(u8 pin, u8 slot)
31{
32 return (((pin - 1) + slot) % 4) + 1;
33}
34
35static u8 __init visws_swizzle(struct pci_dev *dev, u8 *pinp)
36{
37 u8 pin = *pinp;
38
39 while (dev->bus->self) { /* Move up the chain of bridges. */
40 pin = bridge_swizzle(pin, PCI_SLOT(dev->devfn));
41 dev = dev->bus->self;
42 }
43 *pinp = pin;
44
45 return PCI_SLOT(dev->devfn);
46}
47
48static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
49{
50 int irq, bus = dev->bus->number;
51
52 pin--;
53
54 /* Nothing useful at PIIX4 pin 1 */
55 if (bus == pci_bus0 && slot == 4 && pin == 0)
56 return -1;
57
58 /* PIIX4 USB is on Bus 0, Slot 4, Line 3 */
59 if (bus == pci_bus0 && slot == 4 && pin == 3) {
60 irq = CO_IRQ(CO_APIC_PIIX4_USB);
61 goto out;
62 }
63
64 /* First pin spread down 1 APIC entry per slot */
65 if (pin == 0) {
66 irq = CO_IRQ((bus == pci_bus0 ? CO_APIC_PCIB_BASE0 :
67 CO_APIC_PCIA_BASE0) + slot);
68 goto out;
69 }
70
71 /* lines 1,2,3 from any slot are shared in this twirly pattern */
72 if (bus == pci_bus1) {
73 /* lines 1-3 from devices 0-1 rotate over 2 apic entries */
74 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((slot + (pin - 1)) % 2));
75 } else { /* bus == pci_bus0 */
76 /* lines 1-3 from devices 0-3 rotate over 3 apic entries */
77 if (slot == 0)
78 slot = 3; /* same pattern */
79 irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((3 - slot) + (pin - 1) % 3));
80 }
81out:
82 printk(KERN_DEBUG "PCI: Bus %d Slot %d Line %d -> IRQ %d\n", bus, slot, pin, irq);
83 return irq;
84}
85
86void __init pcibios_update_irq(struct pci_dev *dev, int irq)
87{
88 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
89}
90
91static int __init pcibios_init(void)
92{
93 /* The VISWS supports configuration access type 1 only */
94 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
95 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
96
97 pci_bus0 = li_pcib_read16(LI_PCI_BUSNUM) & 0xff;
98 pci_bus1 = li_pcia_read16(LI_PCI_BUSNUM) & 0xff;
99
100 printk(KERN_INFO "PCI: Lithium bridge A bus: %u, "
101 "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0);
102
103 raw_pci_ops = &pci_direct_conf1;
104 pci_scan_bus_with_sysdata(pci_bus0);
105 pci_scan_bus_with_sysdata(pci_bus1);
106 pci_fixup_irqs(visws_swizzle, visws_map_irq);
107 pcibios_resource_survey();
108 return 0;
109}
110
111subsys_initcall(pcibios_init);
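
bridge_swizzle() above is the usual PCI-to-PCI bridge interrupt rotation: each bridge level rotates the INTx pin by the downstream device's slot number before the pin reaches the interrupt router. A standalone sketch of how a pin propagates through bridges (hypothetical slot numbers, illustration only):

#include <stdio.h>

/* Same rotation as bridge_swizzle(): pins are numbered 1..4 for INTA..INTD. */
static unsigned char swizzle(unsigned char pin, unsigned char slot)
{
        return (((pin - 1) + slot) % 4) + 1;
}

int main(void)
{
        /* INTA (1) of a device in slot 3 maps to INTD (4) after one bridge,
           and to INTC (3) after a second slot-3 bridge: prints "4 3". */
        printf("%u %u\n", swizzle(1, 3), swizzle(swizzle(1, 3), 3));
        return 0;
}
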
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
new file mode 100644
index 000000000000..d764ec950065
--- /dev/null
+++ b/arch/x86/power/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_PM) += cpu.o
2obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
new file mode 100644
index 000000000000..998fd3ec0d68
--- /dev/null
+++ b/arch/x86/power/cpu.c
@@ -0,0 +1,133 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/module.h>
11#include <linux/suspend.h>
12#include <asm/mtrr.h>
13#include <asm/mce.h>
14
15static struct saved_context saved_context;
16
17unsigned long saved_context_ebx;
18unsigned long saved_context_esp, saved_context_ebp;
19unsigned long saved_context_esi, saved_context_edi;
20unsigned long saved_context_eflags;
21
22void __save_processor_state(struct saved_context *ctxt)
23{
24 mtrr_save_fixed_ranges(NULL);
25 kernel_fpu_begin();
26
27 /*
28 * descriptor tables
29 */
30 store_gdt(&ctxt->gdt);
31 store_idt(&ctxt->idt);
32 store_tr(ctxt->tr);
33
34 /*
35 * segment registers
36 */
37 savesegment(es, ctxt->es);
38 savesegment(fs, ctxt->fs);
39 savesegment(gs, ctxt->gs);
40 savesegment(ss, ctxt->ss);
41
42 /*
43 * control registers
44 */
45 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4();
49}
50
51void save_processor_state(void)
52{
53 __save_processor_state(&saved_context);
54}
55
56static void do_fpu_end(void)
57{
58 /*
59 * Restore FPU regs if necessary.
60 */
61 kernel_fpu_end();
62}
63
64static void fix_processor_context(void)
65{
66 int cpu = smp_processor_id();
67 struct tss_struct * t = &per_cpu(init_tss, cpu);
68
69 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
70
71 load_TR_desc(); /* This does ltr */
72 load_LDT(&current->active_mm->context); /* This does lldt */
73
74 /*
75 * Now maybe reload the debug registers
76 */
77 if (current->thread.debugreg[7]){
78 set_debugreg(current->thread.debugreg[0], 0);
79 set_debugreg(current->thread.debugreg[1], 1);
80 set_debugreg(current->thread.debugreg[2], 2);
81 set_debugreg(current->thread.debugreg[3], 3);
82 /* no 4 and 5 */
83 set_debugreg(current->thread.debugreg[6], 6);
84 set_debugreg(current->thread.debugreg[7], 7);
85 }
86
87}
88
89void __restore_processor_state(struct saved_context *ctxt)
90{
91 /*
92 * control registers
93 */
94 write_cr4(ctxt->cr4);
95 write_cr3(ctxt->cr3);
96 write_cr2(ctxt->cr2);
97 write_cr0(ctxt->cr0);
98
99 /*
100 * now restore the descriptor tables to their proper values
101 * ltr is done in fix_processor_context().
102 */
103 load_gdt(&ctxt->gdt);
104 load_idt(&ctxt->idt);
105
106 /*
107 * segment registers
108 */
109 loadsegment(es, ctxt->es);
110 loadsegment(fs, ctxt->fs);
111 loadsegment(gs, ctxt->gs);
112 loadsegment(ss, ctxt->ss);
113
114 /*
115 * sysenter MSRs
116 */
117 if (boot_cpu_has(X86_FEATURE_SEP))
118 enable_sep_cpu();
119
120 fix_processor_context();
121 do_fpu_end();
122 mtrr_ap_init();
123 mcheck_init(&boot_cpu_data);
124}
125
126void restore_processor_state(void)
127{
128 __restore_processor_state(&saved_context);
129}
130
131/* Needed by apm.c */
132EXPORT_SYMBOL(save_processor_state);
133EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/power/suspend.c b/arch/x86/power/suspend.c
new file mode 100644
index 000000000000..a0020b913f31
--- /dev/null
+++ b/arch/x86/power/suspend.c
@@ -0,0 +1,172 @@
1/*
2 * Suspend support specific for i386 - temporary page tables
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/suspend.h>
10#include <linux/bootmem.h>
11
12#include <asm/system.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15
16/* Defined in arch/i386/power/swsusp.S */
17extern int restore_image(void);
18
19/* References to section boundaries */
20extern const void __nosave_begin, __nosave_end;
21
22/* Pointer to the temporary resume page tables */
23pgd_t *resume_pg_dir;
24
25/* The following three functions are based on the analogous code in
26 * arch/i386/mm/init.c
27 */
28
29/*
30 * Create a middle page table on a resume-safe page and put a pointer to it in
31 * the given global directory entry. This only returns the gd entry
32 * in non-PAE compilation mode, since the middle layer is folded.
33 */
34static pmd_t *resume_one_md_table_init(pgd_t *pgd)
35{
36 pud_t *pud;
37 pmd_t *pmd_table;
38
39#ifdef CONFIG_X86_PAE
40 pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
41 if (!pmd_table)
42 return NULL;
43
44 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
45 pud = pud_offset(pgd, 0);
46
47 BUG_ON(pmd_table != pmd_offset(pud, 0));
48#else
49 pud = pud_offset(pgd, 0);
50 pmd_table = pmd_offset(pud, 0);
51#endif
52
53 return pmd_table;
54}
55
56/*
57 * Create a page table on a resume-safe page and place a pointer to it in
58 * a middle page directory entry.
59 */
60static pte_t *resume_one_page_table_init(pmd_t *pmd)
61{
62 if (pmd_none(*pmd)) {
63 pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
64 if (!page_table)
65 return NULL;
66
67 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
68
69 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
70
71 return page_table;
72 }
73
74 return pte_offset_kernel(pmd, 0);
75}
76
77/*
78 * This maps the physical memory to kernel virtual address space, a total
79 * of max_low_pfn pages, by creating page tables starting from address
80 * PAGE_OFFSET. The page tables are allocated out of resume-safe pages.
81 */
82static int resume_physical_mapping_init(pgd_t *pgd_base)
83{
84 unsigned long pfn;
85 pgd_t *pgd;
86 pmd_t *pmd;
87 pte_t *pte;
88 int pgd_idx, pmd_idx;
89
90 pgd_idx = pgd_index(PAGE_OFFSET);
91 pgd = pgd_base + pgd_idx;
92 pfn = 0;
93
94 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
95 pmd = resume_one_md_table_init(pgd);
96 if (!pmd)
97 return -ENOMEM;
98
99 if (pfn >= max_low_pfn)
100 continue;
101
102 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
103 if (pfn >= max_low_pfn)
104 break;
105
106 /* Map with big pages if possible, otherwise create
107 * normal page tables.
108 * NOTE: We can mark everything as executable here
109 */
110 if (cpu_has_pse) {
111 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
112 pfn += PTRS_PER_PTE;
113 } else {
114 pte_t *max_pte;
115
116 pte = resume_one_page_table_init(pmd);
117 if (!pte)
118 return -ENOMEM;
119
120 max_pte = pte + PTRS_PER_PTE;
121 for (; pte < max_pte; pte++, pfn++) {
122 if (pfn >= max_low_pfn)
123 break;
124
125 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
126 }
127 }
128 }
129 }
130 return 0;
131}
132
133static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
134{
135#ifdef CONFIG_X86_PAE
136 int i;
137
138 /* Init entries of the first-level page table to the zero page */
139 for (i = 0; i < PTRS_PER_PGD; i++)
140 set_pgd(pg_dir + i,
141 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
142#endif
143}
144
145int swsusp_arch_resume(void)
146{
147 int error;
148
149 resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
150 if (!resume_pg_dir)
151 return -ENOMEM;
152
153 resume_init_first_level_page_table(resume_pg_dir);
154 error = resume_physical_mapping_init(resume_pg_dir);
155 if (error)
156 return error;
157
158 /* We have got enough memory and from now on we cannot recover */
159 restore_image();
160 return 0;
161}
162
163/*
164 * pfn_is_nosave - check if given pfn is in the 'nosave' section
165 */
166
167int pfn_is_nosave(unsigned long pfn)
168{
169 unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
170 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
171 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
172}
diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/swsusp.S
new file mode 100644
index 000000000000..53662e05b393
--- /dev/null
+++ b/arch/x86/power/swsusp.S
@@ -0,0 +1,78 @@
1.text
2
3/* Originally gcc generated, modified by hand
4 *
5 * This may not use any stack, nor any variable that is not "NoSave":
6 *
7 * It's rewriting one kernel image with another. What is the stack in the "old"
8 * image could very well be a data page in the "new" image, and overwriting
9 * your own stack under you is a bad idea.
10 */
11
12#include <linux/linkage.h>
13#include <asm/segment.h>
14#include <asm/page.h>
15#include <asm/asm-offsets.h>
16
17 .text
18
19ENTRY(swsusp_arch_suspend)
20
21 movl %esp, saved_context_esp
22 movl %ebx, saved_context_ebx
23 movl %ebp, saved_context_ebp
24 movl %esi, saved_context_esi
25 movl %edi, saved_context_edi
26 pushfl ; popl saved_context_eflags
27
28 call swsusp_save
29 ret
30
31ENTRY(restore_image)
32 movl resume_pg_dir, %ecx
33 subl $__PAGE_OFFSET, %ecx
34 movl %ecx, %cr3
35
36 movl restore_pblist, %edx
37 .p2align 4,,7
38
39copy_loop:
40 testl %edx, %edx
41 jz done
42
43 movl pbe_address(%edx), %esi
44 movl pbe_orig_address(%edx), %edi
45
46 movl $1024, %ecx
47 rep
48 movsl
49
50 movl pbe_next(%edx), %edx
51 jmp copy_loop
52 .p2align 4,,7
53
54done:
55 /* go back to the original page tables */
56 movl $swapper_pg_dir, %ecx
57 subl $__PAGE_OFFSET, %ecx
58 movl %ecx, %cr3
59 /* Flush TLB, including "global" things (vmalloc) */
60 movl mmu_cr4_features, %eax
61 movl %eax, %edx
62 andl $~(1<<7), %edx; # PGE
63 movl %edx, %cr4; # turn off PGE
64 movl %cr3, %ecx; # flush TLB
65 movl %ecx, %cr3
66 movl %eax, %cr4; # turn PGE back on
67
68 movl saved_context_esp, %esp
69 movl saved_context_ebp, %ebp
70 movl saved_context_ebx, %ebx
71 movl saved_context_esi, %esi
72 movl saved_context_edi, %edi
73
74 pushl saved_context_eflags ; popfl
75
76 xorl %eax, %eax
77
78 ret
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore
new file mode 100644
index 000000000000..f8b69d84238e
--- /dev/null
+++ b/arch/x86/vdso/.gitignore
@@ -0,0 +1 @@
vdso.lds
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
new file mode 100644
index 000000000000..8d03de029d9b
--- /dev/null
+++ b/arch/x86/vdso/Makefile
@@ -0,0 +1,49 @@
1#
2# x86-64 vDSO.
3#
4
5# files to link into the vdso
6# vdso-start.o has to be first
7vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
8
9# files to link into kernel
10obj-y := vma.o vdso.o vdso-syms.o
11
12vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
13
14$(obj)/vdso.o: $(obj)/vdso.so
15
16targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
17
18# The DSO images are built using a special linker script.
19quiet_cmd_syscall = SYSCALL $@
20 cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
21 -Wl,-T,$(filter-out FORCE,$^) -o $@
22
23export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
24
25vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
26 $(call ld-option, -Wl$(comma)--hash-style=sysv) \
27 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
28SYSCFLAGS_vdso.so = $(vdso-flags)
29
30$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
31
32$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
33 $(call if_changed,syscall)
34
35CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
36
37$(obj)/vclock_gettime.o: CFLAGS = $(CFL)
38$(obj)/vgetcpu.o: CFLAGS = $(CFL)
39
40# We also create a special relocatable object that should mirror the symbol
41# table and layout of the linked DSO. With ld -R we can then refer to
42# these symbols in the kernel code rather than hand-coded addresses.
43extra-y += vdso-syms.o
44$(obj)/built-in.o: $(obj)/vdso-syms.o
45$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
46
47SYSCFLAGS_vdso-syms.o = -r -d
48$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
49 $(call if_changed,syscall)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
new file mode 100644
index 000000000000..5b54cdfb2b07
--- /dev/null
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -0,0 +1,121 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of clock_gettime and gettimeofday.
6 *
7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing.
9 * Also alternative() doesn't work.
10 */
11
12#include <linux/kernel.h>
13#include <linux/posix-timers.h>
14#include <linux/time.h>
15#include <linux/string.h>
16#include <asm/vsyscall.h>
17#include <asm/vgtod.h>
18#include <asm/timex.h>
19#include <asm/hpet.h>
20#include <asm/unistd.h>
21#include <asm/io.h>
22#include <asm/vgtod.h>
23#include "vextern.h"
24
25#define gtod vdso_vsyscall_gtod_data
26
27static long vdso_fallback_gettime(long clock, struct timespec *ts)
28{
29 long ret;
30 asm("syscall" : "=a" (ret) :
31 "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
32 return ret;
33}
34
35static inline long vgetns(void)
36{
37 long v;
38 cycles_t (*vread)(void);
39 vread = gtod->clock.vread;
40 v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
41 return (v * gtod->clock.mult) >> gtod->clock.shift;
42}
43
44static noinline int do_realtime(struct timespec *ts)
45{
46 unsigned long seq, ns;
47 do {
48 seq = read_seqbegin(&gtod->lock);
49 ts->tv_sec = gtod->wall_time_sec;
50 ts->tv_nsec = gtod->wall_time_nsec;
51 ns = vgetns();
52 } while (unlikely(read_seqretry(&gtod->lock, seq)));
53 timespec_add_ns(ts, ns);
54 return 0;
55}
56
57/* Copy of the version in kernel/time.c which we cannot directly access */
58static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
59{
60 while (nsec >= NSEC_PER_SEC) {
61 nsec -= NSEC_PER_SEC;
62 ++sec;
63 }
64 while (nsec < 0) {
65 nsec += NSEC_PER_SEC;
66 --sec;
67 }
68 ts->tv_sec = sec;
69 ts->tv_nsec = nsec;
70}
71
72static noinline int do_monotonic(struct timespec *ts)
73{
74 unsigned long seq, ns, secs;
75 do {
76 seq = read_seqbegin(&gtod->lock);
77 secs = gtod->wall_time_sec;
78 ns = gtod->wall_time_nsec + vgetns();
79 secs += gtod->wall_to_monotonic.tv_sec;
80 ns += gtod->wall_to_monotonic.tv_nsec;
81 } while (unlikely(read_seqretry(&gtod->lock, seq)));
82 vset_normalized_timespec(ts, secs, ns);
83 return 0;
84}
85
86int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
87{
88 if (likely(gtod->sysctl_enabled && gtod->clock.vread))
89 switch (clock) {
90 case CLOCK_REALTIME:
91 return do_realtime(ts);
92 case CLOCK_MONOTONIC:
93 return do_monotonic(ts);
94 }
95 return vdso_fallback_gettime(clock, ts);
96}
97int clock_gettime(clockid_t, struct timespec *)
98 __attribute__((weak, alias("__vdso_clock_gettime")));
99
100int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
101{
102 long ret;
103 if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
104 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
105 offsetof(struct timespec, tv_nsec) ||
106 sizeof(*tv) != sizeof(struct timespec));
107 do_realtime((struct timespec *)tv);
108 tv->tv_usec /= 1000;
109 if (unlikely(tz != NULL)) {
110 /* This relies on gcc inlining the memcpy. We'll notice
111 if it ever fails to do so. */
112 memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
113 }
114 return 0;
115 }
116 asm("syscall" : "=a" (ret) :
117 "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
118 return ret;
119}
120int gettimeofday(struct timeval *, struct timezone *)
121 __attribute__((weak, alias("__vdso_gettimeofday")));
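
Because clock_gettime and gettimeofday are exported from the vDSO as weak aliases (the version script in vdso.lds.S later in this patch makes them visible), a program whose C library resolves these calls through the vDSO gets the do_realtime()/do_monotonic() fast path with no syscall at all. A minimal caller, assuming an x86-64 toolchain that routes the call through the vDSO (older glibc may also need -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* Nothing vDSO-specific is needed here; the fast path is used transparently when available. */
        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
                printf("monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
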
diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S
new file mode 100644
index 000000000000..79a071e4357e
--- /dev/null
+++ b/arch/x86/vdso/vdso-note.S
@@ -0,0 +1,12 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/uts.h>
7#include <linux/version.h>
8#include <linux/elfnote.h>
9
10ELFNOTE_START(Linux, 0, "a")
11 .long LINUX_VERSION_CODE
12ELFNOTE_END
diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S
new file mode 100644
index 000000000000..2dc2cdb84d67
--- /dev/null
+++ b/arch/x86/vdso/vdso-start.S
@@ -0,0 +1,2 @@
1 .globl vdso_kernel_start
2vdso_kernel_start:
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
new file mode 100644
index 000000000000..4b1620a1529e
--- /dev/null
+++ b/arch/x86/vdso/vdso.S
@@ -0,0 +1,2 @@
1 .section ".vdso","a"
2 .incbin "arch/x86/vdso/vdso.so"
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
new file mode 100644
index 000000000000..b9a60e665d08
--- /dev/null
+++ b/arch/x86/vdso/vdso.lds.S
@@ -0,0 +1,77 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7#include "voffset.h"
8
9#define VDSO_PRELINK 0xffffffffff700000
10
11SECTIONS
12{
13 . = VDSO_PRELINK + SIZEOF_HEADERS;
14
15 .hash : { *(.hash) } :text
16 .gnu.hash : { *(.gnu.hash) }
17 .dynsym : { *(.dynsym) }
18 .dynstr : { *(.dynstr) }
19 .gnu.version : { *(.gnu.version) }
20 .gnu.version_d : { *(.gnu.version_d) }
21 .gnu.version_r : { *(.gnu.version_r) }
22
23 /* This linker script is used both with -r and with -shared.
24 For the layouts to match, we need to skip more than enough
25 space for the dynamic symbol table et al. If this amount
26 is insufficient, ld -shared will barf. Just increase it here. */
27 . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
28
29 .text : { *(.text) } :text
30 .text.ptr : { *(.text.ptr) } :text
31 . = VDSO_PRELINK + 0x900;
32 .data : { *(.data) } :text
33 .bss : { *(.bss) } :text
34
35 .altinstructions : { *(.altinstructions) } :text
36 .altinstr_replacement : { *(.altinstr_replacement) } :text
37
38 .note : { *(.note.*) } :text :note
39 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
40 .eh_frame : { KEEP (*(.eh_frame)) } :text
41 .dynamic : { *(.dynamic) } :text :dynamic
42 .useless : {
43 *(.got.plt) *(.got)
44 *(.gnu.linkonce.d.*)
45 *(.dynbss)
46 *(.gnu.linkonce.b.*)
47 } :text
48}
49
50/*
51 * We must supply the ELF program headers explicitly to get just one
52 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
53 */
54PHDRS
55{
56 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
57 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
58 note PT_NOTE FLAGS(4); /* PF_R */
59 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
60}
61
62/*
63 * This controls what symbols we export from the DSO.
64 */
65VERSION
66{
67 LINUX_2.6 {
68 global:
69 clock_gettime;
70 __vdso_clock_gettime;
71 gettimeofday;
72 __vdso_gettimeofday;
73 getcpu;
74 __vdso_getcpu;
75 local: *;
76 };
77}
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
new file mode 100644
index 000000000000..1683ba2ae3e8
--- /dev/null
+++ b/arch/x86/vdso/vextern.h
@@ -0,0 +1,16 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section, added to
11 vextern.h, and referenced as a pointer with the vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
new file mode 100644
index 000000000000..91f6e85d0fc2
--- /dev/null
+++ b/arch/x86/vdso/vgetcpu.c
@@ -0,0 +1,50 @@
1/*
2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2
4 *
5 * Fast user context implementation of getcpu()
6 */
7
8#include <linux/kernel.h>
9#include <linux/getcpu.h>
10#include <linux/jiffies.h>
11#include <linux/time.h>
12#include <asm/vsyscall.h>
13#include <asm/vgtod.h>
14#include "vextern.h"
15
16long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
17{
18 unsigned int dummy, p;
19 unsigned long j = 0;
20
21 /* Fast cache - only recompute the value once per jiffy and avoid
22 relatively costly rdtscp/cpuid otherwise.
23 This works because the scheduler usually keeps the process
24 on the same CPU and this syscall doesn't guarantee its
25 results anyway.
26 We do this here because otherwise user space would do it on
27 its own in a likely inferior way (no access to jiffies).
28 If you don't like it, pass NULL. */
29 if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) {
30 p = tcache->blob[1];
31 } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
32 /* Load per CPU data from RDTSCP */
33 rdtscp(dummy, dummy, p);
34 } else {
35 /* Load per CPU data from GDT */
36 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
37 }
38 if (tcache) {
39 tcache->blob[0] = j;
40 tcache->blob[1] = p;
41 }
42 if (cpu)
43 *cpu = p & 0xfff;
44 if (node)
45 *node = p >> 12;
46 return 0;
47}
48
49long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
50 __attribute__((weak, alias("__vdso_getcpu")));
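
The per-CPU value obtained from RDTSCP or from the lsl-based GDT probe packs the CPU number into the low 12 bits and the node number into the bits above, which is why __vdso_getcpu() masks with 0xfff and shifts by 12. A tiny sketch of that packing with made-up CPU and node numbers (illustration only, not part of the patch):

#include <stdio.h>

/* Same layout __vdso_getcpu() decodes: cpu in bits 11:0, node in the bits above. */
static unsigned int pack_cpu_node(unsigned int cpu, unsigned int node)
{
        return (node << 12) | (cpu & 0xfff);
}

int main(void)
{
        unsigned int p = pack_cpu_node(5, 1);
        printf("cpu=%u node=%u\n", p & 0xfff, p >> 12);   /* prints cpu=5 node=1 */
        return 0;
}
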
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
new file mode 100644
index 000000000000..ff9333e5fb08
--- /dev/null
+++ b/arch/x86/vdso/vma.c
@@ -0,0 +1,140 @@
1/*
2 * Set up the VMAs to tell the VM about the vDSO.
3 * Copyright 2007 Andi Kleen, SUSE Labs.
4 * Subject to the GPL, v.2
5 */
6#include <linux/mm.h>
7#include <linux/err.h>
8#include <linux/sched.h>
9#include <linux/init.h>
10#include <linux/random.h>
11#include <asm/vsyscall.h>
12#include <asm/vgtod.h>
13#include <asm/proto.h>
14#include "voffset.h"
15
16int vdso_enabled = 1;
17
18#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
19#include "vextern.h"
20#undef VEXTERN
21
22extern char vdso_kernel_start[], vdso_start[], vdso_end[];
23extern unsigned short vdso_sync_cpuid;
24
25struct page **vdso_pages;
26
27static inline void *var_ref(void *vbase, char *var, char *name)
28{
29 unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
30 void *p = vbase + offset;
31 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0;
34 }
35 return p;
36}
37
38static int __init init_vdso_vars(void)
39{
40 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
41 int i;
42 char *vbase;
43
44 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
45 if (!vdso_pages)
46 goto oom;
47 for (i = 0; i < npages; i++) {
48 struct page *p;
49 p = alloc_page(GFP_KERNEL);
50 if (!p)
51 goto oom;
52 vdso_pages[i] = p;
53 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
54 }
55
56 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
57 if (!vbase)
58 goto oom;
59
60 if (memcmp(vbase, "\177ELF", 4)) {
61 printk("VDSO: I'm broken; not ELF\n");
62 vdso_enabled = 0;
63 }
64
65#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
66#define VEXTERN(x) \
67 V(vdso_ ## x) = &__ ## x;
68#include "vextern.h"
69#undef VEXTERN
70 return 0;
71
72 oom:
73 printk("Cannot allocate vdso\n");
74 vdso_enabled = 0;
75 return -ENOMEM;
76}
77__initcall(init_vdso_vars);
78
79struct linux_binprm;
80
81/* Put the vdso above the (randomized) stack with another randomized offset.
82 This way there is no hole in the middle of the address space.
83 To save memory, make sure it is still in the same PTE as the stack top.
84 This doesn't give that many random bits. */
85static unsigned long vdso_addr(unsigned long start, unsigned len)
86{
87 unsigned long addr, end;
88 unsigned offset;
89 end = (start + PMD_SIZE - 1) & PMD_MASK;
90 if (end >= TASK_SIZE64)
91 end = TASK_SIZE64;
92 end -= len;
93 /* This loses some more bits than a modulo, but is cheaper */
94 offset = get_random_int() & (PTRS_PER_PTE - 1);
95 addr = start + (offset << PAGE_SHIFT);
96 if (addr >= end)
97 addr = end;
98 return addr;
99}
100
101/* Set up a VMA at program startup for the vsyscall page.
102   Not called for compat tasks. */
103int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
104{
105 struct mm_struct *mm = current->mm;
106 unsigned long addr;
107 int ret;
108 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
109
110 if (!vdso_enabled)
111 return 0;
112
113 down_write(&mm->mmap_sem);
114 addr = vdso_addr(mm->start_stack, len);
115 addr = get_unmapped_area(NULL, addr, len, 0, 0);
116 if (IS_ERR_VALUE(addr)) {
117 ret = addr;
118 goto up_fail;
119 }
120
121 ret = install_special_mapping(mm, addr, len,
122 VM_READ|VM_EXEC|
123 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
124 VM_ALWAYSDUMP,
125 vdso_pages);
126 if (ret)
127 goto up_fail;
128
129 current->mm->context.vdso = (void *)addr;
130up_fail:
131 up_write(&mm->mmap_sem);
132 return ret;
133}
134
135static __init int vdso_setup(char *s)
136{
137 vdso_enabled = simple_strtoul(s, NULL, 0);
138 return 0;
139}
140__setup("vdso=", vdso_setup);
diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h
new file mode 100644
index 000000000000..4af67c79085f
--- /dev/null
+++ b/arch/x86/vdso/voffset.h
@@ -0,0 +1 @@
#define VDSO_TEXT_OFFSET 0x600
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
new file mode 100644
index 000000000000..6fc22219a472
--- /dev/null
+++ b/arch/x86/vdso/vvar.c
@@ -0,0 +1,12 @@
1/* Define pointers to external vDSO variables.
2   These are part of the vDSO. The kernel fills in the real addresses
3   at boot time. This is done because, when the vDSO is linked, the
4   kernel isn't linked yet and the final addresses are not known. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
new file mode 100644
index 000000000000..2c447c94adcc
--- /dev/null
+++ b/arch/x86/video/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_FB) += fbdev.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
new file mode 100644
index 000000000000..48fb38d7d2c0
--- /dev/null
+++ b/arch/x86/video/fbdev.c
@@ -0,0 +1,32 @@
1/*
2 * arch/i386/video/fbdev.c - i386 Framebuffer
3 *
4 * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
5 *
6 * This file is subject to the terms and conditions of the GNU General Public
7 * License. See the file COPYING in the main directory of this archive
8 * for more details.
9 *
10 */
11#include <linux/fb.h>
12#include <linux/pci.h>
13
14int fb_is_primary_device(struct fb_info *info)
15{
16 struct device *device = info->device;
17 struct pci_dev *pci_dev = NULL;
18 struct resource *res = NULL;
19 int retval = 0;
20
21 if (device)
22 pci_dev = to_pci_dev(device);
23
24 if (pci_dev)
25 res = &pci_dev->resource[PCI_ROM_RESOURCE];
26
27 if (res && res->flags & IORESOURCE_ROM_SHADOW)
28 retval = 1;
29
30 return retval;
31}
32EXPORT_SYMBOL(fb_is_primary_device);
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/x86/xen/Kconfig
@@ -0,0 +1,11 @@
1#
2# This Kconfig describes xen options
3#
4
5config XEN
6 bool "Enable support for Xen hypervisor"
7 depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
8 help
9 This is the Linux Xen port. Enabling this will allow the
10 kernel to boot in a paravirtualized environment under the
11 Xen hypervisor.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/x86/xen/Makefile
@@ -0,0 +1,4 @@
1obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
2 events.o time.o manage.o xen-asm.o
3
4obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
new file mode 100644
index 000000000000..f01bfcd4bdee
--- /dev/null
+++ b/arch/x86/xen/enlighten.c
@@ -0,0 +1,1146 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
53EXPORT_SYMBOL_GPL(hypercall_page);
54
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3);
60
61struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info);
63
64static /* __initdata */ struct shared_info dummy_shared_info;
65
66/*
67 * Point at some empty memory to start with. We map the real shared_info
68 * page as soon as fixmap is up and running.
69 */
70struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
71
72/*
73 * Flag to determine whether vcpu info placement is available on all
74 * VCPUs. We assume it is to start with, and then set it to zero on
75 * the first failure. This is because it can succeed on some VCPUs
76 * and not others, since it can involve hypervisor memory allocation,
77 * or because the guest failed to guarantee all the appropriate
78 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
79 *
80 * Note that any particular CPU may be using a placed vcpu structure,
81 * but we can only optimise if they all are.
82 *
83 * 0: not available, 1: available
84 */
85static int have_vcpu_info_placement = 1;
86
87static void __init xen_vcpu_setup(int cpu)
88{
89 struct vcpu_register_vcpu_info info;
90 int err;
91 struct vcpu_info *vcpup;
92
93 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
94
95 if (!have_vcpu_info_placement)
96 return; /* already tested, not available */
97
98 vcpup = &per_cpu(xen_vcpu_info, cpu);
99
100 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup);
102
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset);
105
106 /* Check to see if the hypervisor will put the vcpu_info
107 structure where we want it, which allows direct access via
108 a percpu-variable. */
109 err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
110
111 if (err) {
112 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
113 have_vcpu_info_placement = 0;
114 } else {
115 /* This cpu is using the registered vcpu info, even if
116 later ones fail to. */
117 per_cpu(xen_vcpu, cpu) = vcpup;
118
119 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
120 cpu, vcpup);
121 }
122}
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
131static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
132 unsigned int *ecx, unsigned int *edx)
133{
134 unsigned maskedx = ~0;
135
136 /*
137 * Mask out inconvenient features, to try and disable as many
138 * unsupported kernel subsystems as possible.
139 */
140 if (*eax == 1)
141 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
142 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
143 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
144
145 asm(XEN_EMULATE_PREFIX "cpuid"
146 : "=a" (*eax),
147 "=b" (*ebx),
148 "=c" (*ecx),
149 "=d" (*edx)
150 : "0" (*eax), "2" (*ecx));
151 *edx &= maskedx;
152}
153
154static void xen_set_debugreg(int reg, unsigned long val)
155{
156 HYPERVISOR_set_debugreg(reg, val);
157}
158
159static unsigned long xen_get_debugreg(int reg)
160{
161 return HYPERVISOR_get_debugreg(reg);
162}
163
164static unsigned long xen_save_fl(void)
165{
166 struct vcpu_info *vcpu;
167 unsigned long flags;
168
169 vcpu = x86_read_percpu(xen_vcpu);
170
171 /* flag has opposite sense of mask */
172 flags = !vcpu->evtchn_upcall_mask;
173
174 /* convert to IF type flag
175 -0 -> 0x00000000
176 -1 -> 0xffffffff
177 */
178 return (-flags) & X86_EFLAGS_IF;
179}
180
181static void xen_restore_fl(unsigned long flags)
182{
183 struct vcpu_info *vcpu;
184
185 /* convert from IF type flag */
186 flags = !(flags & X86_EFLAGS_IF);
187
188 /* There's a one instruction preempt window here. We need to
189	   make sure we don't switch CPUs between getting the vcpu
190 pointer and updating the mask. */
191 preempt_disable();
192 vcpu = x86_read_percpu(xen_vcpu);
193 vcpu->evtchn_upcall_mask = flags;
194 preempt_enable_no_resched();
195
196 /* Doesn't matter if we get preempted here, because any
197 pending event will get dealt with anyway. */
198
199 if (flags == 0) {
200 preempt_check_resched();
201 barrier(); /* unmask then check (avoid races) */
202 if (unlikely(vcpu->evtchn_upcall_pending))
203 force_evtchn_callback();
204 }
205}
206
207static void xen_irq_disable(void)
208{
209 /* There's a one instruction preempt window here. We need to
210	   make sure we don't switch CPUs between getting the vcpu
211 pointer and updating the mask. */
212 preempt_disable();
213 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
214 preempt_enable_no_resched();
215}
216
217static void xen_irq_enable(void)
218{
219 struct vcpu_info *vcpu;
220
221 /* There's a one instruction preempt window here. We need to
222	   make sure we don't switch CPUs between getting the vcpu
223 pointer and updating the mask. */
224 preempt_disable();
225 vcpu = x86_read_percpu(xen_vcpu);
226 vcpu->evtchn_upcall_mask = 0;
227 preempt_enable_no_resched();
228
229 /* Doesn't matter if we get preempted here, because any
230 pending event will get dealt with anyway. */
231
232 barrier(); /* unmask then check (avoid races) */
233 if (unlikely(vcpu->evtchn_upcall_pending))
234 force_evtchn_callback();
235}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
253{
254 BUG_ON(preemptible());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275}
276
277static unsigned long xen_store_tr(void)
278{
279 return 0;
280}
281
282static void xen_set_ldt(const void *addr, unsigned entries)
283{
284 unsigned long linear_addr = (unsigned long)addr;
285 struct mmuext_op *op;
286 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
287
288 op = mcs.args;
289 op->cmd = MMUEXT_SET_LDT;
290 if (linear_addr) {
291		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
292 xmaddr_t maddr;
293 maddr = arbitrary_virt_to_machine((unsigned long)addr);
294 linear_addr = (unsigned long)maddr.maddr;
295 }
296 op->arg1.linear_addr = linear_addr;
297 op->arg2.nr_ents = entries;
298
299 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
300
301 xen_mc_issue(PARAVIRT_LAZY_CPU);
302}
303
304static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
305{
306 unsigned long *frames;
307 unsigned long va = dtr->address;
308 unsigned int size = dtr->size + 1;
309 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
310 int f;
311 struct multicall_space mcs;
312
313 /* A GDT can be up to 64k in size, which corresponds to 8192
314	   8-byte entries, or 16 4k pages. */
315
316 BUG_ON(size > 65536);
317 BUG_ON(va & ~PAGE_MASK);
318
319 mcs = xen_mc_entry(sizeof(*frames) * pages);
320 frames = mcs.args;
321
322 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
323 frames[f] = virt_to_mfn(va);
324 make_lowmem_page_readonly((void *)va);
325 }
326
327 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
328
329 xen_mc_issue(PARAVIRT_LAZY_CPU);
330}
331
332static void load_TLS_descriptor(struct thread_struct *t,
333 unsigned int cpu, unsigned int i)
334{
335 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
336 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
337 struct multicall_space mc = __xen_mc_entry(0);
338
339 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
340}
341
342static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
343{
344 xen_mc_batch();
345
346 load_TLS_descriptor(t, cpu, 0);
347 load_TLS_descriptor(t, cpu, 1);
348 load_TLS_descriptor(t, cpu, 2);
349
350 xen_mc_issue(PARAVIRT_LAZY_CPU);
351
352 /*
353 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
354 * it means we're in a context switch, and %gs has just been
355 * saved. This means we can zero it out to prevent faults on
356 * exit from the hypervisor if the next process has no %gs.
357 * Either way, it has been saved, and the new value will get
358 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls.
360 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0);
363}
364
365static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
366 u32 low, u32 high)
367{
368 unsigned long lp = (unsigned long)&dt[entrynum];
369 xmaddr_t mach_lp = virt_to_machine(lp);
370 u64 entry = (u64)high << 32 | low;
371
372 preempt_disable();
373
374 xen_mc_flush();
375 if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
376 BUG();
377
378 preempt_enable();
379}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
403/* Locations of each CPU's IDT */
404static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
405
406/* Set an IDT entry. If the entry is part of the current IDT, then
407 also update Xen. */
408static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
409 u32 low, u32 high)
410{
411 unsigned long p = (unsigned long)&dt[entrynum];
412 unsigned long start, end;
413
414 preempt_disable();
415
416 start = __get_cpu_var(idt_desc).address;
417 end = start + __get_cpu_var(idt_desc).size + 1;
418
419 xen_mc_flush();
420
421 write_dt_entry(dt, entrynum, low, high);
422
423 if (p >= start && (p + 8) <= end) {
424 struct trap_info info[2];
425
426 info[1].address = 0;
427
428 if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
429 if (HYPERVISOR_set_trap_table(info))
430 BUG();
431 }
432
433 preempt_enable();
434}
435
436static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
437 struct trap_info *traps)
438{
439 unsigned in, out, count;
440
441 count = (desc->size+1) / 8;
442 BUG_ON(count > 256);
443
444 for (in = out = 0; in < count; in++) {
445 const u32 *entry = (u32 *)(desc->address + in * 8);
446
447 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
448 out++;
449 }
450 traps[out].address = 0;
451}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
460/* Load a new IDT into Xen. In principle this can be per-CPU, so we
461 hold a spinlock to protect the static traps[] array (static because
462 it avoids allocation, and saves stack space). */
463static void xen_load_idt(const struct Xgt_desc_struct *desc)
464{
465 static DEFINE_SPINLOCK(lock);
466 static struct trap_info traps[257];
467
468 spin_lock(&lock);
469
470 __get_cpu_var(idt_desc) = *desc;
471
472 xen_convert_trap_info(desc, traps);
473
474 xen_mc_flush();
475 if (HYPERVISOR_set_trap_table(traps))
476 BUG();
477
478 spin_unlock(&lock);
479}
480
481/* Write a GDT descriptor entry. Ignore LDT descriptors, since
482 they're handled differently. */
483static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
484 u32 low, u32 high)
485{
486 preempt_disable();
487
488 switch ((high >> 8) & 0xff) {
489 case DESCTYPE_LDT:
490 case DESCTYPE_TSS:
491 /* ignore */
492 break;
493
494 default: {
495 xmaddr_t maddr = virt_to_machine(&dt[entry]);
496 u64 desc = (u64)high << 32 | low;
497
498 xen_mc_flush();
499 if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
500 BUG();
501 }
502
503 }
504
505 preempt_enable();
506}
507
508static void xen_load_esp0(struct tss_struct *tss,
509 struct thread_struct *thread)
510{
511 struct multicall_space mcs = xen_mc_entry(0);
512 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
513 xen_mc_issue(PARAVIRT_LAZY_CPU);
514}
515
516static void xen_set_iopl_mask(unsigned mask)
517{
518 struct physdev_set_iopl set_iopl;
519
520 /* Force the change at ring 0. */
521 set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
522 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
523}
524
525static void xen_io_delay(void)
526{
527}
528
529#ifdef CONFIG_X86_LOCAL_APIC
530static unsigned long xen_apic_read(unsigned long reg)
531{
532 return 0;
533}
534
535static void xen_apic_write(unsigned long reg, unsigned long val)
536{
537 /* Warn to see if there's any stray references */
538 WARN_ON(1);
539}
540#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
567static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
568 unsigned long va)
569{
570 struct {
571 struct mmuext_op op;
572 cpumask_t mask;
573 } *args;
574 cpumask_t cpumask = *cpus;
575 struct multicall_space mcs;
576
577 /*
578 * A couple of (to be removed) sanity checks:
579 *
580 * - current CPU must not be in mask
581 * - mask must exist :)
582 */
583 BUG_ON(cpus_empty(cpumask));
584 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
585 BUG_ON(!mm);
586
587 /* If a CPU which we ran on has gone down, OK. */
588 cpus_and(cpumask, cpumask, cpu_online_map);
589 if (cpus_empty(cpumask))
590 return;
591
592 mcs = xen_mc_entry(sizeof(*args));
593 args = mcs.args;
594 args->mask = cpumask;
595 args->op.arg2.vcpumask = &args->mask;
596
597 if (va == TLB_FLUSH_ALL) {
598 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
599 } else {
600 args->op.cmd = MMUEXT_INVLPG_MULTI;
601 args->op.arg1.linear_addr = va;
602 }
603
604 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
605
606 xen_mc_issue(PARAVIRT_LAZY_MMU);
607}
608
609static void xen_write_cr2(unsigned long cr2)
610{
611 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
612}
613
614static unsigned long xen_read_cr2(void)
615{
616 return x86_read_percpu(xen_vcpu)->arch.cr2;
617}
618
619static unsigned long xen_read_cr2_direct(void)
620{
621 return x86_read_percpu(xen_vcpu_info.arch.cr2);
622}
623
624static void xen_write_cr4(unsigned long cr4)
625{
626 /* Just ignore cr4 changes; Xen doesn't allow us to do
627 anything anyway. */
628}
629
630static unsigned long xen_read_cr3(void)
631{
632 return x86_read_percpu(xen_cr3);
633}
634
635static void xen_write_cr3(unsigned long cr3)
636{
637 BUG_ON(preemptible());
638
639 if (cr3 == x86_read_percpu(xen_cr3)) {
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644
645 x86_write_percpu(xen_cr3, cr3);
646
647
648 {
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
658
659 xen_mc_issue(PARAVIRT_LAZY_CPU);
660 }
661}
662
663/* Early in boot, while setting up the initial pagetable, assume
664 everything is pinned. */
665static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666{
667 BUG_ON(mem_map); /* should only be used early */
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669}
670
671/* This needs to make sure the new pte page is pinned iff it's being
672 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
674{
675 struct page *page = pfn_to_page(pfn);
676
677 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page);
679
680 if (!PageHighMem(page))
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else
683 /* make sure there are no stray mappings of
684 this page */
685 kmap_flush_unused();
686 }
687}
688
689/* This should never happen until we're OK to use struct page */
690static void xen_release_pt(u32 pfn)
691{
692 struct page *page = pfn_to_page(pfn);
693
694 if (PagePinned(page)) {
695 if (!PageHighMem(page))
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
697 }
698}
699
700#ifdef CONFIG_HIGHPTE
701static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
702{
703 pgprot_t prot = PAGE_KERNEL;
704
705 if (PagePinned(page))
706 prot = PAGE_KERNEL_RO;
707
708 if (0 && PageHighMem(page))
709 printk("mapping highpte %lx type %d prot %s\n",
710 page_to_pfn(page), type,
711 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
712
713 return kmap_atomic_prot(page, type, prot);
714}
715#endif
716
717static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
718{
719 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
720 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
721 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
722 pte_val_ma(pte));
723
724 return pte;
725}
726
727/* Init-time set_pte while constructing initial pagetables, which
728 doesn't allow RO pagetable pages to be remapped RW */
729static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
730{
731 pte = mask_rw_pte(ptep, pte);
732
733 xen_set_pte(ptep, pte);
734}
735
736static __init void xen_pagetable_setup_start(pgd_t *base)
737{
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739
740 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init;
742
743 init_mm.pgd = base;
744 /*
745 * copy top-level of Xen-supplied pagetable into place. For
746 * !PAE we can use this as-is, but for PAE it is a stand-in
747 * while we copy the pmd pages.
748 */
749 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
750
751 if (PTRS_PER_PMD > 1) {
752 int i;
753 /*
754 * For PAE, need to allocate new pmds, rather than
755 * share Xen's, since Xen doesn't like pmd's being
756 * shared between address spaces.
757 */
758 for (i = 0; i < PTRS_PER_PGD; i++) {
759 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
760 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
761
762 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
763 PAGE_SIZE);
764
765 make_lowmem_page_readonly(pmd);
766
767 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
768 } else
769 pgd_clear(&base[i]);
770 }
771 }
772
773 /* make sure zero_page is mapped RO so we can use it in pagetables */
774 make_lowmem_page_readonly(empty_zero_page);
775 make_lowmem_page_readonly(base);
776 /*
777 * Switch to new pagetable. This is done before
778 * pagetable_init has done anything so that the new pages
779 * added to the table can be prepared properly for Xen.
780 */
781 xen_write_cr3(__pa(base));
782}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815		op.cmd = MMUEXT_PIN_L2_TABLE;	/* non-PAE: two-level pagetable */
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
823/* This is called once we have the cpu_possible_map */
824void __init xen_setup_vcpu_info_placement(void)
825{
826 int cpu;
827
828 for_each_possible_cpu(cpu)
829 xen_vcpu_setup(cpu);
830
831 /* xen_vcpu_setup managed to place the vcpu_info within the
832 percpu area for all cpus, so make use of it */
833 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835
836 paravirt_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct;
842 }
843}
844
845static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
846 unsigned long addr, unsigned len)
847{
848 char *start, *end, *reloc;
849 unsigned ret;
850
851 start = end = reloc = NULL;
852
853#define SITE(x) \
854 case PARAVIRT_PATCH(x): \
855 if (have_vcpu_info_placement) { \
856 start = (char *)xen_##x##_direct; \
857 end = xen_##x##_direct_end; \
858 reloc = xen_##x##_direct_reloc; \
859 } \
860 goto patch_site
861
862 switch (type) {
863 SITE(irq_enable);
864 SITE(irq_disable);
865 SITE(save_fl);
866 SITE(restore_fl);
867#undef SITE
868
869 patch_site:
870 if (start == NULL || (end-start) > len)
871 goto default_patch;
872
873 ret = paravirt_patch_insns(insnbuf, len, start, end);
874
875 /* Note: because reloc is assigned from something that
876 appears to be an array, gcc assumes it's non-null,
877 but doesn't know its relationship with start and
878 end. */
879 if (reloc > start && reloc < end) {
880 int reloc_off = reloc - start;
881 long *relocp = (long *)(insnbuf + reloc_off);
882 long delta = start - (char *)addr;
883
884 *relocp += delta;
885 }
886 break;
887
888 default_patch:
889 default:
890 ret = paravirt_patch_default(type, clobbers, insnbuf,
891 addr, len);
892 break;
893 }
894
895 return ret;
896}
897
898static const struct paravirt_ops xen_paravirt_ops __initdata = {
899 .paravirt_enabled = 1,
900 .shared_kernel_pmd = 0,
901
902 .name = "Xen",
903 .banner = xen_banner,
904
905 .patch = xen_patch,
906
907 .memory_setup = xen_memory_setup,
908 .arch_setup = xen_arch_setup,
909 .init_IRQ = xen_init_IRQ,
910 .post_allocator_init = xen_mark_init_mm_pinned,
911
912 .time_init = xen_time_init,
913 .set_wallclock = xen_set_wallclock,
914 .get_wallclock = xen_get_wallclock,
915 .get_cpu_khz = xen_cpu_khz,
916 .sched_clock = xen_sched_clock,
917
918 .cpuid = xen_cpuid,
919
920 .set_debugreg = xen_set_debugreg,
921 .get_debugreg = xen_get_debugreg,
922
923 .clts = native_clts,
924
925 .read_cr0 = native_read_cr0,
926 .write_cr0 = native_write_cr0,
927
928 .read_cr2 = xen_read_cr2,
929 .write_cr2 = xen_write_cr2,
930
931 .read_cr3 = xen_read_cr3,
932 .write_cr3 = xen_write_cr3,
933
934 .read_cr4 = native_read_cr4,
935 .read_cr4_safe = native_read_cr4_safe,
936 .write_cr4 = xen_write_cr4,
937
938 .save_fl = xen_save_fl,
939 .restore_fl = xen_restore_fl,
940 .irq_disable = xen_irq_disable,
941 .irq_enable = xen_irq_enable,
942 .safe_halt = xen_safe_halt,
943 .halt = xen_halt,
944 .wbinvd = native_wbinvd,
945
946 .read_msr = native_read_msr_safe,
947 .write_msr = native_write_msr_safe,
948 .read_tsc = native_read_tsc,
949 .read_pmc = native_read_pmc,
950
951 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
952 .irq_enable_sysexit = NULL, /* never called */
953
954 .load_tr_desc = paravirt_nop,
955 .set_ldt = xen_set_ldt,
956 .load_gdt = xen_load_gdt,
957 .load_idt = xen_load_idt,
958 .load_tls = xen_load_tls,
959
960 .store_gdt = native_store_gdt,
961 .store_idt = native_store_idt,
962 .store_tr = xen_store_tr,
963
964 .write_ldt_entry = xen_write_ldt_entry,
965 .write_gdt_entry = xen_write_gdt_entry,
966 .write_idt_entry = xen_write_idt_entry,
967 .load_esp0 = xen_load_esp0,
968
969 .set_iopl_mask = xen_set_iopl_mask,
970 .io_delay = xen_io_delay,
971
972#ifdef CONFIG_X86_LOCAL_APIC
973 .apic_write = xen_apic_write,
974 .apic_write_atomic = xen_apic_write,
975 .apic_read = xen_apic_read,
976 .setup_boot_clock = paravirt_nop,
977 .setup_secondary_clock = paravirt_nop,
978 .startup_ipi_hook = paravirt_nop,
979#endif
980
981 .flush_tlb_user = xen_flush_tlb,
982 .flush_tlb_kernel = xen_flush_tlb,
983 .flush_tlb_single = xen_flush_tlb_single,
984 .flush_tlb_others = xen_flush_tlb_others,
985
986 .pte_update = paravirt_nop,
987 .pte_update_defer = paravirt_nop,
988
989 .pagetable_setup_start = xen_pagetable_setup_start,
990 .pagetable_setup_done = xen_pagetable_setup_done,
991
992 .alloc_pt = xen_alloc_pt_init,
993 .release_pt = xen_release_pt,
994 .alloc_pd = paravirt_nop,
995 .alloc_pd_clone = paravirt_nop,
996 .release_pd = paravirt_nop,
997
998#ifdef CONFIG_HIGHPTE
999 .kmap_atomic_pte = xen_kmap_atomic_pte,
1000#endif
1001
1002 .set_pte = NULL, /* see xen_pagetable_setup_* */
1003 .set_pte_at = xen_set_pte_at,
1004 .set_pmd = xen_set_pmd,
1005
1006 .pte_val = xen_pte_val,
1007 .pgd_val = xen_pgd_val,
1008
1009 .make_pte = xen_make_pte,
1010 .make_pgd = xen_make_pgd,
1011
1012#ifdef CONFIG_X86_PAE
1013 .set_pte_atomic = xen_set_pte_atomic,
1014 .set_pte_present = xen_set_pte_at,
1015 .set_pud = xen_set_pud,
1016 .pte_clear = xen_pte_clear,
1017 .pmd_clear = xen_pmd_clear,
1018
1019 .make_pmd = xen_make_pmd,
1020 .pmd_val = xen_pmd_val,
1021#endif /* PAE */
1022
1023 .activate_mm = xen_activate_mm,
1024 .dup_mmap = xen_dup_mmap,
1025 .exit_mmap = xen_exit_mmap,
1026
1027 .set_lazy_mode = xen_set_lazy_mode,
1028};
1029
1030#ifdef CONFIG_SMP
1031static const struct smp_ops xen_smp_ops __initdata = {
1032 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1033 .smp_prepare_cpus = xen_smp_prepare_cpus,
1034 .cpu_up = xen_cpu_up,
1035 .smp_cpus_done = xen_smp_cpus_done,
1036
1037 .smp_send_stop = xen_smp_send_stop,
1038 .smp_send_reschedule = xen_smp_send_reschedule,
1039 .smp_call_function_mask = xen_smp_call_function_mask,
1040};
1041#endif /* CONFIG_SMP */
1042
1043static void xen_reboot(int reason)
1044{
1045#ifdef CONFIG_SMP
1046 smp_send_stop();
1047#endif
1048
1049 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
1050 BUG();
1051}
1052
1053static void xen_restart(char *msg)
1054{
1055 xen_reboot(SHUTDOWN_reboot);
1056}
1057
1058static void xen_emergency_restart(void)
1059{
1060 xen_reboot(SHUTDOWN_reboot);
1061}
1062
1063static void xen_machine_halt(void)
1064{
1065 xen_reboot(SHUTDOWN_poweroff);
1066}
1067
1068static void xen_crash_shutdown(struct pt_regs *regs)
1069{
1070 xen_reboot(SHUTDOWN_crash);
1071}
1072
1073static const struct machine_ops __initdata xen_machine_ops = {
1074 .restart = xen_restart,
1075 .halt = xen_machine_halt,
1076 .power_off = xen_machine_halt,
1077 .shutdown = xen_machine_halt,
1078 .crash_shutdown = xen_crash_shutdown,
1079 .emergency_restart = xen_emergency_restart,
1080};
1081
1082
1083/* First C function to be called on Xen boot */
1084asmlinkage void __init xen_start_kernel(void)
1085{
1086 pgd_t *pgd;
1087
1088 if (!xen_start_info)
1089 return;
1090
1091 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1092
1093 /* Install Xen paravirt ops */
1094 paravirt_ops = xen_paravirt_ops;
1095 machine_ops = xen_machine_ops;
1096
1097#ifdef CONFIG_SMP
1098 smp_ops = xen_smp_ops;
1099#endif
1100
1101 xen_setup_features();
1102
1103 /* Get mfn list */
1104 if (!xen_feature(XENFEAT_auto_translated_physmap))
1105 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
1106
1107 pgd = (pgd_t *)xen_start_info->pt_base;
1108
1109 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1110
1111 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1112
1113 /* keep using Xen gdt for now; no urgent need to change it */
1114
1115 x86_write_percpu(xen_cr3, __pa(pgd));
1116
1117#ifdef CONFIG_SMP
1118 /* Don't do the full vcpu_info placement stuff until we have a
1119 possible map. */
1120 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1121#else
1122 /* May as well do it now, since there's no good time to call
1123 it later on UP. */
1124 xen_setup_vcpu_info_placement();
1125#endif
1126
1127 paravirt_ops.kernel_rpl = 1;
1128 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1129 paravirt_ops.kernel_rpl = 0;
1130
1131 /* set the limit of our address space */
1132 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
1133
1134 /* set up basic CPUID stuff */
1135 cpu_detect(&new_cpu_data);
1136 new_cpu_data.hard_math = 1;
1137 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1138
1139 /* Poke various useful things into boot_params */
1140 LOADER_TYPE = (9 << 4) | 0;
1141 INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
1142 INITRD_SIZE = xen_start_info->mod_len;
1143
1144 /* Start the world */
1145 start_kernel();
1146}
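
The xen_save_fl()/xen_restore_fl() helpers above convert between the vcpu's evtchn_upcall_mask and an eflags-style IF bit with a small arithmetic trick. A standalone sketch of just that conversion, assuming X86_EFLAGS_IF == 0x200 as on x86:

#include <stdio.h>

#define X86_EFLAGS_IF 0x200

static unsigned long mask_to_flags(unsigned char evtchn_upcall_mask)
{
	unsigned long flags = !evtchn_upcall_mask;	/* 1 = interrupts enabled */
	return (-flags) & X86_EFLAGS_IF;		/* 0x200 or 0 */
}

static unsigned char flags_to_mask(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);		/* 1 = events masked */
}

int main(void)
{
	printf("%lx %lx\n", mask_to_flags(0), mask_to_flags(1));	/* 200 0 */
	printf("%x %x\n", flags_to_mask(0x200), flags_to_mask(0));	/* 0 1 */
	return 0;
}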
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
new file mode 100644
index 000000000000..da1b173547a1
--- /dev/null
+++ b/arch/x86/xen/events.c
@@ -0,0 +1,591 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h>
35
36#include <xen/events.h>
37#include <xen/interface/xen.h>
38#include <xen/interface/event_channel.h>
39
40#include "xen-ops.h"
41
42/*
43 * This lock protects updates to the following mapping and reference-count
44 * arrays. The lock does not need to be acquired to read the mapping tables.
45 */
46static DEFINE_SPINLOCK(irq_mapping_update_lock);
47
48/* IRQ <-> VIRQ mapping. */
49static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50
51/* IRQ <-> IPI mapping */
52static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
53
54/* Packed IRQ information: binding type, sub-type index, and event channel. */
55struct packed_irq
56{
57 unsigned short evtchn;
58 unsigned char index;
59 unsigned char type;
60};
61
62static struct packed_irq irq_info[NR_IRQS];
63
64/* Binding types. */
65enum {
66 IRQT_UNBOUND,
67 IRQT_PIRQ,
68 IRQT_VIRQ,
69 IRQT_IPI,
70 IRQT_EVTCHN
71};
72
73/* Convenient shorthand for packed representation of an unbound IRQ. */
74#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
75
76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
77 [0 ... NR_EVENT_CHANNELS-1] = -1
78};
79static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
80static u8 cpu_evtchn[NR_EVENT_CHANNELS];
81
82/* Reference counts for bindings to IRQs. */
83static int irq_bindcount[NR_IRQS];
84
85/* Xen will never allocate port zero for any purpose. */
86#define VALID_EVTCHN(chn) ((chn) != 0)
87
88/*
89 * Force a proper event-channel callback from Xen after clearing the
90 * callback mask. We do this in a very simple manner, by making a call
91 * down into Xen. The pending flag will be checked by Xen on return.
92 */
93void force_evtchn_callback(void)
94{
95 (void)HYPERVISOR_xen_version(0, NULL);
96}
97EXPORT_SYMBOL_GPL(force_evtchn_callback);
98
99static struct irq_chip xen_dynamic_chip;
100
101/* Constructor for packed IRQ information. */
102static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
103{
104 return (struct packed_irq) { evtchn, index, type };
105}
106
107/*
108 * Accessors for packed IRQ information.
109 */
110static inline unsigned int evtchn_from_irq(int irq)
111{
112 return irq_info[irq].evtchn;
113}
114
115static inline unsigned int index_from_irq(int irq)
116{
117 return irq_info[irq].index;
118}
119
120static inline unsigned int type_from_irq(int irq)
121{
122 return irq_info[irq].type;
123}
124
125static inline unsigned long active_evtchns(unsigned int cpu,
126 struct shared_info *sh,
127 unsigned int idx)
128{
129 return (sh->evtchn_pending[idx] &
130 cpu_evtchn_mask[cpu][idx] &
131 ~sh->evtchn_mask[idx]);
132}
133
134static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
135{
136 int irq = evtchn_to_irq[chn];
137
138 BUG_ON(irq == -1);
139#ifdef CONFIG_SMP
140 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
141#endif
142
143 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
144 __set_bit(chn, cpu_evtchn_mask[cpu]);
145
146 cpu_evtchn[chn] = cpu;
147}
148
149static void init_evtchn_cpu_bindings(void)
150{
151#ifdef CONFIG_SMP
152 int i;
153 /* By default all event channels notify CPU#0. */
154 for (i = 0; i < NR_IRQS; i++)
155 irq_desc[i].affinity = cpumask_of_cpu(0);
156#endif
157
158 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
159 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
160}
161
162static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
163{
164 return cpu_evtchn[evtchn];
165}
166
167static inline void clear_evtchn(int port)
168{
169 struct shared_info *s = HYPERVISOR_shared_info;
170 sync_clear_bit(port, &s->evtchn_pending[0]);
171}
172
173static inline void set_evtchn(int port)
174{
175 struct shared_info *s = HYPERVISOR_shared_info;
176 sync_set_bit(port, &s->evtchn_pending[0]);
177}
178
179
180/**
181 * notify_remote_via_irq - send event to remote end of event channel via irq
182 * @irq: irq of event channel to send event to
183 *
184 * Unlike notify_remote_via_evtchn(), this is safe to use across
185 * save/restore. Notifications on a broken connection are silently
186 * dropped.
187 */
188void notify_remote_via_irq(int irq)
189{
190 int evtchn = evtchn_from_irq(irq);
191
192 if (VALID_EVTCHN(evtchn))
193 notify_remote_via_evtchn(evtchn);
194}
195EXPORT_SYMBOL_GPL(notify_remote_via_irq);
196
197static void mask_evtchn(int port)
198{
199 struct shared_info *s = HYPERVISOR_shared_info;
200 sync_set_bit(port, &s->evtchn_mask[0]);
201}
202
203static void unmask_evtchn(int port)
204{
205 struct shared_info *s = HYPERVISOR_shared_info;
206 unsigned int cpu = get_cpu();
207
208 BUG_ON(!irqs_disabled());
209
210 /* Slow path (hypercall) if this is a non-local port. */
211 if (unlikely(cpu != cpu_from_evtchn(port))) {
212 struct evtchn_unmask unmask = { .port = port };
213 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
214 } else {
215 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
216
217 sync_clear_bit(port, &s->evtchn_mask[0]);
218
219 /*
220 * The following is basically the equivalent of
221 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
222 * the interrupt edge' if the channel is masked.
223 */
224 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
225 !sync_test_and_set_bit(port / BITS_PER_LONG,
226 &vcpu_info->evtchn_pending_sel))
227 vcpu_info->evtchn_upcall_pending = 1;
228 }
229
230 put_cpu();
231}
232
233static int find_unbound_irq(void)
234{
235 int irq;
236
237 /* Only allocate from dynirq range */
238 for (irq = 0; irq < NR_IRQS; irq++)
239 if (irq_bindcount[irq] == 0)
240 break;
241
242 if (irq == NR_IRQS)
243 panic("No available IRQ to bind to: increase NR_IRQS!\n");
244
245 return irq;
246}
247
248int bind_evtchn_to_irq(unsigned int evtchn)
249{
250 int irq;
251
252 spin_lock(&irq_mapping_update_lock);
253
254 irq = evtchn_to_irq[evtchn];
255
256 if (irq == -1) {
257 irq = find_unbound_irq();
258
259 dynamic_irq_init(irq);
260 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
261 handle_level_irq, "event");
262
263 evtchn_to_irq[evtchn] = irq;
264 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
265 }
266
267 irq_bindcount[irq]++;
268
269 spin_unlock(&irq_mapping_update_lock);
270
271 return irq;
272}
273EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
274
275static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
276{
277 struct evtchn_bind_ipi bind_ipi;
278 int evtchn, irq;
279
280 spin_lock(&irq_mapping_update_lock);
281
282 irq = per_cpu(ipi_to_irq, cpu)[ipi];
283 if (irq == -1) {
284 irq = find_unbound_irq();
285 if (irq < 0)
286 goto out;
287
288 dynamic_irq_init(irq);
289 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
290 handle_level_irq, "ipi");
291
292 bind_ipi.vcpu = cpu;
293 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
294 &bind_ipi) != 0)
295 BUG();
296 evtchn = bind_ipi.port;
297
298 evtchn_to_irq[evtchn] = irq;
299 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
300
301 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
302
303 bind_evtchn_to_cpu(evtchn, cpu);
304 }
305
306 irq_bindcount[irq]++;
307
308 out:
309 spin_unlock(&irq_mapping_update_lock);
310 return irq;
311}
312
313
314static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
315{
316 struct evtchn_bind_virq bind_virq;
317 int evtchn, irq;
318
319 spin_lock(&irq_mapping_update_lock);
320
321 irq = per_cpu(virq_to_irq, cpu)[virq];
322
323 if (irq == -1) {
324 bind_virq.virq = virq;
325 bind_virq.vcpu = cpu;
326 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
327 &bind_virq) != 0)
328 BUG();
329 evtchn = bind_virq.port;
330
331 irq = find_unbound_irq();
332
333 dynamic_irq_init(irq);
334 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
335 handle_level_irq, "virq");
336
337 evtchn_to_irq[evtchn] = irq;
338 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
339
340 per_cpu(virq_to_irq, cpu)[virq] = irq;
341
342 bind_evtchn_to_cpu(evtchn, cpu);
343 }
344
345 irq_bindcount[irq]++;
346
347 spin_unlock(&irq_mapping_update_lock);
348
349 return irq;
350}
351
352static void unbind_from_irq(unsigned int irq)
353{
354 struct evtchn_close close;
355 int evtchn = evtchn_from_irq(irq);
356
357 spin_lock(&irq_mapping_update_lock);
358
359 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
360 close.port = evtchn;
361 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
362 BUG();
363
364 switch (type_from_irq(irq)) {
365 case IRQT_VIRQ:
366 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
367 [index_from_irq(irq)] = -1;
368 break;
369 default:
370 break;
371 }
372
373 /* Closed ports are implicitly re-bound to VCPU0. */
374 bind_evtchn_to_cpu(evtchn, 0);
375
376 evtchn_to_irq[evtchn] = -1;
377 irq_info[irq] = IRQ_UNBOUND;
378
379 dynamic_irq_init(irq);
380 }
381
382 spin_unlock(&irq_mapping_update_lock);
383}
384
385int bind_evtchn_to_irqhandler(unsigned int evtchn,
386 irqreturn_t (*handler)(int, void *),
387 unsigned long irqflags,
388 const char *devname, void *dev_id)
389{
390 unsigned int irq;
391 int retval;
392
393 irq = bind_evtchn_to_irq(evtchn);
394 retval = request_irq(irq, handler, irqflags, devname, dev_id);
395 if (retval != 0) {
396 unbind_from_irq(irq);
397 return retval;
398 }
399
400 return irq;
401}
402EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
403
404int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
405 irqreturn_t (*handler)(int, void *),
406 unsigned long irqflags, const char *devname, void *dev_id)
407{
408 unsigned int irq;
409 int retval;
410
411 irq = bind_virq_to_irq(virq, cpu);
412 retval = request_irq(irq, handler, irqflags, devname, dev_id);
413 if (retval != 0) {
414 unbind_from_irq(irq);
415 return retval;
416 }
417
418 return irq;
419}
420EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
421
422int bind_ipi_to_irqhandler(enum ipi_vector ipi,
423 unsigned int cpu,
424 irq_handler_t handler,
425 unsigned long irqflags,
426 const char *devname,
427 void *dev_id)
428{
429 int irq, retval;
430
431 irq = bind_ipi_to_irq(ipi, cpu);
432 if (irq < 0)
433 return irq;
434
435 retval = request_irq(irq, handler, irqflags, devname, dev_id);
436 if (retval != 0) {
437 unbind_from_irq(irq);
438 return retval;
439 }
440
441 return irq;
442}
443
444void unbind_from_irqhandler(unsigned int irq, void *dev_id)
445{
446 free_irq(irq, dev_id);
447 unbind_from_irq(irq);
448}
449EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
450
451void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
452{
453 int irq = per_cpu(ipi_to_irq, cpu)[vector];
454 BUG_ON(irq < 0);
455 notify_remote_via_irq(irq);
456}
457
458
459/*
460 * Search the CPU's pending event bitmasks. For each one found, map
461 * the event number to an irq, and feed it into do_IRQ() for
462 * handling.
463 *
464 * Xen uses a two-level bitmap to speed searching. The first level is
465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves.
467 */
468fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
469{
470 int cpu = get_cpu();
471 struct shared_info *s = HYPERVISOR_shared_info;
472 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
473 unsigned long pending_words;
474
475 vcpu_info->evtchn_upcall_pending = 0;
476
477 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
478 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
479 while (pending_words != 0) {
480 unsigned long pending_bits;
481 int word_idx = __ffs(pending_words);
482 pending_words &= ~(1UL << word_idx);
483
484 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
485 int bit_idx = __ffs(pending_bits);
486 int port = (word_idx * BITS_PER_LONG) + bit_idx;
487 int irq = evtchn_to_irq[port];
488
489 if (irq != -1) {
490 regs->orig_eax = ~irq;
491 do_IRQ(regs);
492 }
493 }
494 }
495
496 put_cpu();
497}
498
499/* Rebind an evtchn so that it gets delivered to a specific cpu */
500static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
501{
502 struct evtchn_bind_vcpu bind_vcpu;
503 int evtchn = evtchn_from_irq(irq);
504
505 if (!VALID_EVTCHN(evtchn))
506 return;
507
508 /* Send future instances of this interrupt to other vcpu. */
509 bind_vcpu.port = evtchn;
510 bind_vcpu.vcpu = tcpu;
511
512 /*
513 * If this fails, it usually just indicates that we're dealing with a
514 * virq or IPI channel, which don't actually need to be rebound. Ignore
515 * it, but don't do the xenlinux-level rebind in that case.
516 */
517 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
518 bind_evtchn_to_cpu(evtchn, tcpu);
519}
520
521
522static void set_affinity_irq(unsigned irq, cpumask_t dest)
523{
524 unsigned tcpu = first_cpu(dest);
525 rebind_irq_to_cpu(irq, tcpu);
526}
527
528static void enable_dynirq(unsigned int irq)
529{
530 int evtchn = evtchn_from_irq(irq);
531
532 if (VALID_EVTCHN(evtchn))
533 unmask_evtchn(evtchn);
534}
535
536static void disable_dynirq(unsigned int irq)
537{
538 int evtchn = evtchn_from_irq(irq);
539
540 if (VALID_EVTCHN(evtchn))
541 mask_evtchn(evtchn);
542}
543
544static void ack_dynirq(unsigned int irq)
545{
546 int evtchn = evtchn_from_irq(irq);
547
548 move_native_irq(irq);
549
550 if (VALID_EVTCHN(evtchn))
551 clear_evtchn(evtchn);
552}
553
554static int retrigger_dynirq(unsigned int irq)
555{
556 int evtchn = evtchn_from_irq(irq);
557 int ret = 0;
558
559 if (VALID_EVTCHN(evtchn)) {
560 set_evtchn(evtchn);
561 ret = 1;
562 }
563
564 return ret;
565}
566
567static struct irq_chip xen_dynamic_chip __read_mostly = {
568 .name = "xen-dyn",
569 .mask = disable_dynirq,
570 .unmask = enable_dynirq,
571 .ack = ack_dynirq,
572 .set_affinity = set_affinity_irq,
573 .retrigger = retrigger_dynirq,
574};
575
576void __init xen_init_IRQ(void)
577{
578 int i;
579
580 init_evtchn_cpu_bindings();
581
582 /* No event channels are 'live' right now. */
583 for (i = 0; i < NR_EVENT_CHANNELS; i++)
584 mask_evtchn(i);
585
586 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
587 for (i = 0; i < NR_IRQS; i++)
588 irq_bindcount[i] = 0;
589
590 irq_ctx_init(smp_processor_id());
591}
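
A rough sketch of how a frontend driver would consume the binding API exported above; the event-channel number, handler and device cookie here are hypothetical, and error handling is minimal:

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t my_frontend_interrupt(int irq, void *dev_id)
{
	/* service the ring/queue associated with dev_id here */
	return IRQ_HANDLED;
}

static int my_frontend_connect(unsigned int evtchn, void *dev)
{
	int irq = bind_evtchn_to_irqhandler(evtchn, my_frontend_interrupt,
					    0, "my-frontend", dev);
	if (irq < 0)
		return irq;	/* request_irq() failure is propagated */

	/* later teardown: unbind_from_irqhandler(irq, dev); */
	return 0;
}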
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/x86/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
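
The submaps filled in here are consumed through a trivial accessor declared in <xen/features.h>; it is assumed to look roughly like this (shown only for context, not part of this file):

static inline int xen_feature(int flag)
{
	return xen_features[flag];
}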
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/x86/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle external requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644
index 000000000000..874db0cd1d2a
--- /dev/null
+++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,567 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
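/*
 * As a rough sketch of the pfn<->mfn handling described above (using the
 * pfn_to_mfn/mfn_to_pfn and pte helpers from <xen/page.h>), building a
 * kernel pte and reading it back looks something like:
 *
 *	pte_t pte = mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL);
 *	...
 *	unsigned long back = mfn_to_pfn(pte_mfn(pte));	(== pfn again)
 */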
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63	unsigned offset = address & ~PAGE_MASK;
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given physical page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval)
156{
157 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs;
160 mcs = xen_mc_entry(0);
161
162 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163 xen_mc_issue(PARAVIRT_LAZY_MMU);
164 return;
165 } else
166 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167 return;
168 }
169 xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
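/*
 * Ordering note (assuming set_pte is only used on entries that are not
 * currently live): the present bit lives in pte_low, so writing pte_high
 * first means the entry only becomes valid once both halves are in place.
 */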
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194 smp_wmb();
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below pte_limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
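/*
  For example, pinning a freshly-created pagetable (xen_pgd_pin below) is
  just
	pgd_walk(pgd, pin_page, TASK_SIZE);
  which queues a read-only remapping for each lowmem pagetable page it
  visits.
*/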
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367 if ((pmd_limit-1) < (addr-1)) {
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416 struct multicall_space mcs;
417 struct mmuext_op *op;
418
419 xen_mc_batch();
420
421 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422 /* re-enable interrupts for kmap_flush_unused */
423 xen_mc_issue(0);
424 kmap_flush_unused();
425 xen_mc_batch();
426 }
427
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434 op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439 xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489 pgd_walk(pgd, unpin_page, TASK_SIZE);
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another cpu may still have its %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetables, which means that the
543 * hypervisor forces it to be read-only, and it controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to use init_mm,
548 * unpin the old process's pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may be still using the
552 * pagetable because of lazy tlb flushing. This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
557 get_cpu(); /* make sure we don't move around */
558 drop_mm_ref(mm);
559 put_cpu();
560
561 spin_lock(&mm->page_table_lock);
562
563 /* pgd may not be pinned in the error exit path of execve */
564 if (PagePinned(virt_to_page(mm->pgd)))
565 xen_pgd_unpin(mm->pgd);
566 spin_unlock(&mm->page_table_lock);
567}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/x86/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
#define _XEN_MMU_H
2
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
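/*
 * Worked example: a pagetable base at mfn 0x123456 packs to
 *	xen_pfn_to_cr3(0x123456) == 0x23456000 | 0x001 == 0x23456001
 * and xen_cr3_to_pfn(0x23456001) == 0x23456 | 0x100000 == 0x123456.
 */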
17
18
19void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/x86/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for any data which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
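/*
 * The usual calling pattern (see xen_set_pmd in mmu.c, for instance,
 * which runs with preemption disabled) is roughly:
 *
 *	mcs = xen_mc_entry(sizeof(*u));		(implies xen_mc_batch())
 *	u = mcs.args;
 *	u->ptr = ...; u->val = ...;
 *	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);	(flushes unless in lazy mode)
 */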
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
29#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
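/* i.e. 32 calls * 16 bytes / sizeof(u64) == 64 u64 slots of argument space */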
31
32struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH];
34 u64 args[MC_ARGS];
35 unsigned mcidx, argidx;
36};
37
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
39DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
41void xen_mc_flush(void)
42{
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0;
45 unsigned long flags;
46
47 BUG_ON(preemptible());
48
49 /* Disable interrupts in case someone comes in and queues
50 something in the middle */
51 local_irq_save(flags);
52
53 if (b->mcidx) {
54 int i;
55
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG();
58 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0)
60 ret++;
61 b->mcidx = 0;
62 b->argidx = 0;
63 } else
64 BUG_ON(b->argidx != 0);
65
66 local_irq_restore(flags);
67
68 BUG_ON(ret);
69}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/x86/xen/multicalls.h
@@ -0,0 +1,45 @@
1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H
3
4#include "xen-ops.h"
5
6/* Multicalls */
7struct multicall_space
8{
9 struct multicall_entry *mc;
10 void *args;
11};
12
13/* Allocate room for a multicall and its args */
14struct multicall_space __xen_mc_entry(size_t args);
15
16DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
17
18/* Call to start a batch of multiple __xen_mc_entry()s. Must be
19 paired with xen_mc_issue() */
20static inline void xen_mc_batch(void)
21{
22 /* need to disable interrupts until this entry is complete */
23 local_irq_save(__get_cpu_var(xen_mc_irq_flags));
24}
25
26static inline struct multicall_space xen_mc_entry(size_t args)
27{
28 xen_mc_batch();
29 return __xen_mc_entry(args);
30}
31
32/* Flush all pending multicalls */
33void xen_mc_flush(void);
34
35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode)
37{
38 if ((xen_get_lazy_mode() & mode) == 0)
39 xen_mc_flush();
40
41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43}
44
45#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
new file mode 100644
index 000000000000..f84e77226646
--- /dev/null
+++ b/arch/x86/xen/setup.c
@@ -0,0 +1,111 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22#include "vdso.h"
23
24/* These are code, but not functions. Defined in entry.S */
25extern const char xen_hypervisor_callback[];
26extern const char xen_failsafe_callback[];
27
28unsigned long *phys_to_machine_mapping;
29EXPORT_SYMBOL(phys_to_machine_mapping);
30
31/**
32 * machine_specific_memory_setup - Hook for machine specific memory setup.
33 **/
34
35char * __init xen_memory_setup(void)
36{
37 unsigned long max_pfn = xen_start_info->nr_pages;
38
39 e820.nr_map = 0;
40 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
41
42 return "Xen";
43}
44
45static void xen_idle(void)
46{
47 local_irq_disable();
48
49 if (need_resched())
50 local_irq_enable();
51 else {
52 current_thread_info()->status &= ~TS_POLLING;
53 smp_mb__after_clear_bit();
54 safe_halt();
55 current_thread_info()->status |= TS_POLLING;
56 }
57}
58
59/*
60 * Set the bit indicating "nosegneg" library variants should be used.
61 */
62static void fiddle_vdso(void)
63{
64 extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
65 extern char vsyscall_int80_start;
66 u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
67 &vsyscall_int80_start);
68 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
69}
70
71void __init xen_arch_setup(void)
72{
73 struct physdev_set_iopl set_iopl;
74 int rc;
75
76 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
77 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
78
79 if (!xen_feature(XENFEAT_auto_translated_physmap))
80 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
81
82 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
83 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
84
85 set_iopl.iopl = 1;
86 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
87 if (rc != 0)
88 printk(KERN_INFO "physdev_op failed %d\n", rc);
89
90#ifdef CONFIG_ACPI
91 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
92 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
93 disable_acpi();
94 }
95#endif
96
97 memcpy(boot_command_line, xen_start_info->cmd_line,
98 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
99 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
100
101 pm_idle = xen_idle;
102
103#ifdef CONFIG_SMP
104 /* fill cpus_possible with all available cpus */
105 xen_fill_possible_map();
106#endif
107
108 paravirt_disable_iospace();
109
110 fiddle_vdso();
111}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/x86/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
38static cpumask_t cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq);
40static DEFINE_PER_CPU(int, callfunc_irq);
41
42/*
43 * Structure and data for smp_call_function(). This is designed to minimise
44 * static memory requirements. It also looks cleaner.
45 */
46static DEFINE_SPINLOCK(call_lock);
47
48struct call_data_struct {
49 void (*func) (void *info);
50 void *info;
51 atomic_t started;
52 atomic_t finished;
53 int wait;
54};
55
56static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
57
58static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
65static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
66{
67 return IRQ_HANDLED;
68}
69
70static __cpuinit void cpu_bringup_and_idle(void)
71{
72 int cpu = smp_processor_id();
73
74 cpu_init();
75
76 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE;
78
79 xen_setup_cpu_clockevents();
80
81 /* We can take interrupts now: we're officially "up". */
82 local_irq_enable();
83
84 wmb(); /* make sure everything is out */
85 cpu_idle();
86}
87
88static int xen_smp_intr_init(unsigned int cpu)
89{
90 int rc;
91 const char *resched_name, *callfunc_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
97 cpu,
98 xen_reschedule_interrupt,
99 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
100 resched_name,
101 NULL);
102 if (rc < 0)
103 goto fail;
104 per_cpu(resched_irq, cpu) = rc;
105
106 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
107 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
108 cpu,
109 xen_call_function_interrupt,
110 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
111 callfunc_name,
112 NULL);
113 if (rc < 0)
114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc;
116
117 return 0;
118
119 fail:
120 if (per_cpu(resched_irq, cpu) >= 0)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
124 return rc;
125}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
138void __init xen_smp_prepare_boot_cpu(void)
139{
140 int cpu;
141
142 BUG_ON(smp_processor_id() != 0);
143 native_smp_prepare_boot_cpu();
144
145 /* We've switched to the "real" per-cpu gdt, so make sure the
146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) {
150 cpus_clear(cpu_sibling_map[cpu]);
151 cpus_clear(cpu_core_map[cpu]);
152 }
153
154 xen_setup_vcpu_info_placement();
155}
156
157void __init xen_smp_prepare_cpus(unsigned int max_cpus)
158{
159 unsigned cpu;
160
161 for (cpu = 0; cpu < NR_CPUS; cpu++) {
162 cpus_clear(cpu_sibling_map[cpu]);
163 cpus_clear(cpu_core_map[cpu]);
164 }
165
166 smp_store_cpu_info(0);
167 set_cpu_sibling_map(0);
168
169 if (xen_smp_intr_init(0))
170 BUG();
171
172 cpu_initialized_map = cpumask_of_cpu(0);
173
174 /* Restrict the possible_map according to max_cpus. */
175 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
176 for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
177 continue;
178 cpu_clear(cpu, cpu_possible_map);
179 }
180
181 for_each_possible_cpu (cpu) {
182 struct task_struct *idle;
183
184 if (cpu == 0)
185 continue;
186
187 idle = fork_idle(cpu);
188 if (IS_ERR(idle))
189 panic("failed fork for CPU %d", cpu);
190
191 cpu_set(cpu, cpu_present_map);
192 }
193
194 //init_xenbus_allowed_cpumask();
195}
196
197static __cpuinit int
198cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
199{
200 struct vcpu_guest_context *ctxt;
201 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
202
203 if (cpu_test_and_set(cpu, cpu_initialized_map))
204 return 0;
205
206 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
207 if (ctxt == NULL)
208 return -ENOMEM;
209
210 ctxt->flags = VGCF_IN_KERNEL;
211 ctxt->user_regs.ds = __USER_DS;
212 ctxt->user_regs.es = __USER_DS;
213 ctxt->user_regs.fs = __KERNEL_PERCPU;
214 ctxt->user_regs.gs = 0;
215 ctxt->user_regs.ss = __KERNEL_DS;
216 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
217 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
218
219 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
220
221 xen_copy_trap_info(ctxt->trap_ctxt);
222
223 ctxt->ldt_ents = 0;
224
225 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
226 make_lowmem_page_readonly(gdt->gdt);
227
228 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
229 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
230
231 ctxt->user_regs.cs = __KERNEL_CS;
232 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
233
234 ctxt->kernel_ss = __KERNEL_DS;
235 ctxt->kernel_sp = idle->thread.esp0;
236
237 ctxt->event_callback_cs = __KERNEL_CS;
238 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
239 ctxt->failsafe_callback_cs = __KERNEL_CS;
240 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
241
242 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
243 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
244
245 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
246 BUG();
247
248 kfree(ctxt);
249 return 0;
250}
251
252int __cpuinit xen_cpu_up(unsigned int cpu)
253{
254 struct task_struct *idle = idle_task(cpu);
255 int rc;
256
257#if 0
258 rc = cpu_up_check(cpu);
259 if (rc)
260 return rc;
261#endif
262
263 init_gdt(cpu);
264 per_cpu(current_task, cpu) = idle;
265 irq_ctx_init(cpu);
266 xen_setup_timer(cpu);
267
268 /* make sure interrupts start blocked */
269 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
270
271 rc = cpu_initialize_context(cpu, idle);
272 if (rc)
273 return rc;
274
275 if (num_online_cpus() == 1)
276 alternatives_smp_switch(1);
277
278 rc = xen_smp_intr_init(cpu);
279 if (rc)
280 return rc;
281
282 smp_store_cpu_info(cpu);
283 set_cpu_sibling_map(cpu);
284 /* This must be done before setting cpu_online_map */
285 wmb();
286
287 cpu_set(cpu, cpu_online_map);
288
289 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
290 BUG_ON(rc);
291
292 return 0;
293}
294
295void xen_smp_cpus_done(unsigned int max_cpus)
296{
297}
298
299static void stop_self(void *v)
300{
301 int cpu = smp_processor_id();
302
303 /* make sure we're not pinning something down */
304 load_cr3(swapper_pg_dir);
305 /* should set up a minimal gdt */
306
307 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
308 BUG();
309}
310
311void xen_smp_send_stop(void)
312{
313 smp_call_function(stop_self, NULL, 0, 0);
314}
315
316void xen_smp_send_reschedule(int cpu)
317{
318 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
319}
320
321
322static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
323{
324 unsigned cpu;
325
326 cpus_and(mask, mask, cpu_online_map);
327
328 for_each_cpu_mask(cpu, mask)
329 xen_send_IPI_one(cpu, vector);
330}
331
332static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
333{
334 void (*func) (void *info) = call_data->func;
335 void *info = call_data->info;
336 int wait = call_data->wait;
337
338 /*
339 * Notify initiating CPU that I've grabbed the data and am
340 * about to execute the function
341 */
342 mb();
343 atomic_inc(&call_data->started);
344 /*
345 * At this point the info structure may be out of scope unless wait==1
346 */
347 irq_enter();
348 (*func)(info);
349 irq_exit();
350
351 if (wait) {
352 mb(); /* commit everything before setting finished */
353 atomic_inc(&call_data->finished);
354 }
355
356 return IRQ_HANDLED;
357}
358
359int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
360 void *info, int wait)
361{
362 struct call_data_struct data;
363 int cpus;
364
365 /* Holding any lock stops cpus from going down. */
366 spin_lock(&call_lock);
367
368 cpu_clear(smp_processor_id(), mask);
369
370 cpus = cpus_weight(mask);
371 if (!cpus) {
372 spin_unlock(&call_lock);
373 return 0;
374 }
375
376 /* Can deadlock when called with interrupts disabled */
377 WARN_ON(irqs_disabled());
378
379 data.func = func;
380 data.info = info;
381 atomic_set(&data.started, 0);
382 data.wait = wait;
383 if (wait)
384 atomic_set(&data.finished, 0);
385
386 call_data = &data;
387 mb(); /* write everything before IPI */
388
389 /* Send a message to other CPUs and wait for them to respond */
390 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
391
392 /* Make sure other vcpus get a chance to run.
393 XXX too severe? Maybe we should check the other CPU's states? */
394 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
395
396 /* Wait for response */
397 while (atomic_read(&data.started) != cpus ||
398 (wait && atomic_read(&data.finished) != cpus))
399 cpu_relax();
400
401 spin_unlock(&call_lock);
402
403 return 0;
404}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644
index 000000000000..dfd6db69ead5
--- /dev/null
+++ b/arch/x86/xen/time.c
@@ -0,0 +1,593 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
25#define XEN_SHIFT 22
26
27/* Xen may fire a timer up to this many ns early */
28#define TIMER_SLOP 100000
29#define NS_PER_TICK (1000000000LL / HZ)
30
31static cycle_t xen_clocksource_read(void);
32
33/* These are periodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46
47/* snapshots of runstate info */
48static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49
50/* unused ns of stolen and blocked time */
51static DEFINE_PER_CPU(u64, residual_stolen);
52static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return a consistent snapshot of 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
86static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87{
88 u64 state_time;
89 struct vcpu_runstate_info *state;
90
91 BUG_ON(preemptible());
92
93 state = &__get_cpu_var(runstate);
94
95 /*
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
99 */
100 do {
101 state_time = get64(&state->state_entry_time);
102 barrier();
103 *res = *state;
104 barrier();
105 } while (get64(&state->state_entry_time) != state_time);
106}
107
108static void setup_runstate_info(int cpu)
109{
110 struct vcpu_register_runstate_memory_area area;
111
112 area.addr.v = &per_cpu(runstate, cpu);
113
114 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 cpu, &area))
116 BUG();
117}
118
119static void do_stolen_accounting(void)
120{
121 struct vcpu_runstate_info state;
122 struct vcpu_runstate_info *snap;
123 s64 blocked, runnable, offline, stolen;
124 cputime_t ticks;
125
126 get_runstate_snapshot(&state);
127
128 WARN_ON(state.state != RUNSTATE_running);
129
130 snap = &__get_cpu_var(runstate_snapshot);
131
132 /* work out how much time the VCPU has not been runn*ing* */
133 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136
137 *snap = state;
138
139 /* Add the appropriate number of ticks of stolen time,
140 including any left-overs from last time. Passing NULL to
141 account_steal_time accounts the time as stolen. */
142 stolen = runnable + offline + __get_cpu_var(residual_stolen);
143
144 if (stolen < 0)
145 stolen = 0;
146
147 ticks = 0;
148 while (stolen >= NS_PER_TICK) {
149 ticks++;
150 stolen -= NS_PER_TICK;
151 }
152 __get_cpu_var(residual_stolen) = stolen;
153 account_steal_time(NULL, ticks);
154
155 /* Add the appropriate number of ticks of blocked time,
156 including any left-overs from last time. Passing idle to
157 account_steal_time accounts the time as idle/wait. */
158 blocked += __get_cpu_var(residual_blocked);
159
160 if (blocked < 0)
161 blocked = 0;
162
163 ticks = 0;
164 while (blocked >= NS_PER_TICK) {
165 ticks++;
166 blocked -= NS_PER_TICK;
167 }
168 __get_cpu_var(residual_blocked) = blocked;
169 account_steal_time(idle_task(smp_processor_id()), ticks);
170}
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
177unsigned long long xen_sched_clock(void)
178{
179 struct vcpu_runstate_info state;
180 cycle_t now;
181 u64 ret;
182 s64 offset;
183
184 /*
185 * Ideally sched_clock should be called on a per-cpu basis
186 * anyway, so preempt should already be disabled, but that's
187 * not current practice at the moment.
188 */
189 preempt_disable();
190
191 now = xen_clocksource_read();
192
193 get_runstate_snapshot(&state);
194
195 WARN_ON(state.state != RUNSTATE_running);
196
197 offset = now - state.state_entry_time;
198 if (offset < 0)
199 offset = 0;
200
201 ret = state.time[RUNSTATE_blocked] +
202 state.time[RUNSTATE_running] +
203 offset;
204
205 preempt_enable();
206
207 return ret;
208}
209
210
211/* Get the CPU speed from Xen */
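/*
 * Xen publishes the TSC->ns conversion as a 32-bit fraction plus a shift,
 * i.e. ns = tsc * tsc_to_system_mul * 2^tsc_shift >> 32, so the CPU rate
 * in kHz comes out (roughly) as 10^6 * 2^32 / tsc_to_system_mul, corrected
 * by tsc_shift below.
 */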
212unsigned long xen_cpu_khz(void)
213{
214 u64 cpu_khz = 1000000ULL << 32;
215 const struct vcpu_time_info *info =
216 &HYPERVISOR_shared_info->vcpu_info[0].time;
217
218 do_div(cpu_khz, info->tsc_to_system_mul);
219 if (info->tsc_shift < 0)
220 cpu_khz <<= -info->tsc_shift;
221 else
222 cpu_khz >>= info->tsc_shift;
223
224 return cpu_khz;
225}
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
231static unsigned get_time_values_from_xen(void)
232{
233 struct vcpu_time_info *src;
234 struct shadow_time_info *dst;
235
236 /* src is shared memory with the hypervisor, so we need to
237 make sure we get a consistent snapshot, even in the face of
238 being preempted. */
239 src = &__get_cpu_var(xen_vcpu)->time;
240 dst = &__get_cpu_var(shadow_time);
241
242 do {
243 dst->version = src->version;
244 rmb(); /* fetch version before data */
245 dst->tsc_timestamp = src->tsc_timestamp;
246 dst->system_timestamp = src->system_time;
247 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
248 dst->tsc_shift = src->tsc_shift;
249 rmb(); /* test version after fetching data */
250 } while ((src->version & 1) | (dst->version ^ src->version));
251
252 return dst->version;
253}
254
255/*
256 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
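/*
 * Conceptually (ignoring the 32-bit overflow which the asm below avoids):
 *
 *	delta = (shift < 0) ? delta >> -shift : delta << shift;
 *	product = (delta * (u64)mul_frac) >> 32;
 *
 * i.e. mul_frac is a 0.32 fixed-point multiplier applied to the shifted
 * delta.
 */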
259static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260{
261 u64 product;
262#ifdef __i386__
263 u32 tmp1, tmp2;
264#endif
265
266 if (shift < 0)
267 delta >>= -shift;
268 else
269 delta <<= shift;
270
271#ifdef __i386__
272 __asm__ (
273 "mul %5 ; "
274 "mov %4,%%eax ; "
275 "mov %%edx,%4 ; "
276 "mul %5 ; "
277 "xor %5,%5 ; "
278 "add %4,%%eax ; "
279 "adc %5,%%edx ; "
280 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282#elif __x86_64__
283 __asm__ (
284 "mul %%rdx ; shrd $32,%%rdx,%%rax"
285 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286#else
287#error implement me!
288#endif
289
290 return product;
291}
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
301static cycle_t xen_clocksource_read(void)
302{
303 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 cycle_t ret;
305 unsigned version;
306
307 do {
308 version = get_time_values_from_xen();
309 barrier();
310 ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 barrier();
312 } while (version != __get_cpu_var(xen_vcpu)->time.version);
313
314 put_cpu_var(shadow_time);
315
316 return ret;
317}
318
319static void xen_read_wallclock(struct timespec *ts)
320{
321 const struct shared_info *s = HYPERVISOR_shared_info;
322 u32 version;
323 u64 delta;
324 struct timespec now;
325
326 /* get wallclock at system boot */
327 do {
328 version = s->wc_version;
329 rmb(); /* fetch version before time */
330 now.tv_sec = s->wc_sec;
331 now.tv_nsec = s->wc_nsec;
332 rmb(); /* fetch time before checking version */
333 } while ((s->wc_version & 1) | (version ^ s->wc_version));
334
335 delta = xen_clocksource_read(); /* time since system boot */
336 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337
338 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 now.tv_sec = delta;
340
341 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342}
343
344unsigned long xen_get_wallclock(void)
345{
346 struct timespec ts;
347
348 xen_read_wallclock(&ts);
349
350 return ts.tv_sec;
351}
352
353int xen_set_wallclock(unsigned long now)
354{
355 /* do nothing for domU */
356 return -1;
357}
358
359static struct clocksource xen_clocksource __read_mostly = {
360 .name = "xen",
361 .rating = 400,
362 .read = xen_clocksource_read,
363 .mask = ~0,
364 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
365 .shift = XEN_SHIFT,
366 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
367};
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376   single-shot timer with nanosecond resolution. However, a 100Hz tick
377   sharing the same event channel is also delivered while the vcpu is
378   running. We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396 apply that to a kernel's absolute timeout. Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
400static s64 get_abs_timeout(unsigned long delta)
401{
402 return xen_clocksource_read() + delta;
403}
404
405static void xen_timerop_set_mode(enum clock_event_mode mode,
406 struct clock_event_device *evt)
407{
408 switch (mode) {
409 case CLOCK_EVT_MODE_PERIODIC:
410 /* unsupported */
411 WARN_ON(1);
412 break;
413
414 case CLOCK_EVT_MODE_ONESHOT:
415 case CLOCK_EVT_MODE_RESUME:
416 break;
417
418 case CLOCK_EVT_MODE_UNUSED:
419 case CLOCK_EVT_MODE_SHUTDOWN:
420 HYPERVISOR_set_timer_op(0); /* cancel timeout */
421 break;
422 }
423}
424
425static int xen_timerop_set_next_event(unsigned long delta,
426 struct clock_event_device *evt)
427{
428 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
429
430 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
431 BUG();
432
433 /* We may have missed the deadline, but there's no real way of
434 knowing for sure. If the event was in the past, then we'll
435 get an immediate interrupt. */
436
437 return 0;
438}
439
440static const struct clock_event_device xen_timerop_clockevent = {
441 .name = "xen",
442 .features = CLOCK_EVT_FEAT_ONESHOT,
443
444 .max_delta_ns = 0xffffffff,
445 .min_delta_ns = TIMER_SLOP,
446
447 .mult = 1,
448 .shift = 0,
449 .rating = 500,
450
451 .set_mode = xen_timerop_set_mode,
452 .set_next_event = xen_timerop_set_next_event,
453};
454
455
456
457static void xen_vcpuop_set_mode(enum clock_event_mode mode,
458 struct clock_event_device *evt)
459{
460 int cpu = smp_processor_id();
461
462 switch (mode) {
463 case CLOCK_EVT_MODE_PERIODIC:
464 WARN_ON(1); /* unsupported */
465 break;
466
467 case CLOCK_EVT_MODE_ONESHOT:
468 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
469 BUG();
470 break;
471
472 case CLOCK_EVT_MODE_UNUSED:
473 case CLOCK_EVT_MODE_SHUTDOWN:
474 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
475 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
476 BUG();
477 break;
478 case CLOCK_EVT_MODE_RESUME:
479 break;
480 }
481}
482
483static int xen_vcpuop_set_next_event(unsigned long delta,
484 struct clock_event_device *evt)
485{
486 int cpu = smp_processor_id();
487 struct vcpu_set_singleshot_timer single;
488 int ret;
489
490 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
491
492 single.timeout_abs_ns = get_abs_timeout(delta);
493 single.flags = VCPU_SSHOTTMR_future;
494
495 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
496
497 BUG_ON(ret != 0 && ret != -ETIME);
498
499 return ret;
500}
501
502static const struct clock_event_device xen_vcpuop_clockevent = {
503 .name = "xen",
504 .features = CLOCK_EVT_FEAT_ONESHOT,
505
506 .max_delta_ns = 0xffffffff,
507 .min_delta_ns = TIMER_SLOP,
508
509 .mult = 1,
510 .shift = 0,
511 .rating = 500,
512
513 .set_mode = xen_vcpuop_set_mode,
514 .set_next_event = xen_vcpuop_set_next_event,
515};
516
517static const struct clock_event_device *xen_clockevent =
518 &xen_timerop_clockevent;
519static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
520
521static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
522{
523 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
524 irqreturn_t ret;
525
526 ret = IRQ_NONE;
527 if (evt->event_handler) {
528 evt->event_handler(evt);
529 ret = IRQ_HANDLED;
530 }
531
532 do_stolen_accounting();
533
534 return ret;
535}
536
537void xen_setup_timer(int cpu)
538{
539 const char *name;
540 struct clock_event_device *evt;
541 int irq;
542
543 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
544
545 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
546 if (!name)
547 name = "<timer kasprintf failed>";
548
549 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
550 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
551 name, NULL);
552
553 evt = &per_cpu(xen_clock_events, cpu);
554 memcpy(evt, xen_clockevent, sizeof(*evt));
555
556 evt->cpumask = cpumask_of_cpu(cpu);
557 evt->irq = irq;
558
559 setup_runstate_info(cpu);
560}
561
562void xen_setup_cpu_clockevents(void)
563{
564 BUG_ON(preemptible());
565
566 clockevents_register_device(&__get_cpu_var(xen_clock_events));
567}
568
569__init void xen_time_init(void)
570{
571 int cpu = smp_processor_id();
572
573 get_time_values_from_xen();
574
575 clocksource_register(&xen_clocksource);
576
577 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
578 /* Successfully turned off 100Hz tick, so we have the
579 vcpuop-based timer interface */
580 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
581 xen_clockevent = &xen_vcpuop_clockevent;
582 }
583
584 /* Set initial system time with full resolution */
585 xen_read_wallclock(&xtime);
586 set_normalized_timespec(&wall_to_monotonic,
587 -xtime.tv_sec, -xtime.tv_nsec);
588
589 tsc_disable = 0;
590
591 xen_setup_timer(cpu);
592 xen_setup_cpu_clockevents();
593}
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
new file mode 100644
index 000000000000..861fedfe5230
--- /dev/null
+++ b/arch/x86/xen/vdso.h
@@ -0,0 +1,4 @@
1/* Bit used for the pseudo-hwcap for non-negative segments. We use
2 bit 1 to avoid bugs in some versions of glibc when bit 0 is
3 used; the choice is otherwise arbitrary. */
4#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32   event status with a single 'and' operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Clear mask and test pending */
37 andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
38 /* Preempt here doesn't matter because that will deal with
39 any pending interrupts. The pending check may end up being
40 run on the wrong CPU, but that doesn't hurt. */
41 jz 1f
422: call check_events
431:
44ENDPATCH(xen_irq_enable_direct)
45 ret
46 ENDPROC(xen_irq_enable_direct)
47 RELOC(xen_irq_enable_direct, 2b+1)
48
49
50/*
51 Disabling events is simply a matter of making the event mask
52 non-zero.
53 */
54ENTRY(xen_irq_disable_direct)
55 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
56ENDPATCH(xen_irq_disable_direct)
57 ret
58 ENDPROC(xen_irq_disable_direct)
59 RELOC(xen_irq_disable_direct, 0)
60
61/*
62 (xen_)save_fl is used to get the current interrupt enable status.
63 Callers expect the status to be in X86_EFLAGS_IF, and other bits
64 may be set in the return value. We take advantage of this by
65 making sure that X86_EFLAGS_IF has the right value (and other bits
66 in that byte are 0), but other bits in the return value are
67 undefined. We need to toggle the state of the bit, because
68 Xen and x86 use opposite senses (mask vs enable).
69 */
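/*
   Worked example: with events enabled the mask byte is 0, so testb sets
   ZF, setz makes %ah = 1 and addb %ah,%ah doubles it to 2 -- bit 9 of
   %eax, ie X86_EFLAGS_IF. With events masked, %ah ends up 0.
 */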
70ENTRY(xen_save_fl_direct)
71 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
72 setz %ah
73 addb %ah,%ah
74ENDPATCH(xen_save_fl_direct)
75 ret
76 ENDPROC(xen_save_fl_direct)
77 RELOC(xen_save_fl_direct, 0)
78
79
80/*
81 In principle the caller should be passing us a value return
82 from xen_save_fl_direct, but for robustness sake we test only
83 the X86_EFLAGS_IF flag rather than the whole byte. After
84 setting the interrupt mask state, it checks for unmasked
85 pending events and enters the hypervisor to get them delivered
86 if so.
87 */
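/*
   Note (relying on the layout of struct vcpu_info): the pending byte is
   the low byte and the mask byte the high byte of the word tested below,
   so cmpw $0x0001 matches exactly "pending and no longer masked".
 */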
88ENTRY(xen_restore_fl_direct)
89 testb $X86_EFLAGS_IF>>8, %ah
90 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
91 /* Preempt here doesn't matter because that will deal with
92 any pending interrupts. The pending check may end up being
93 run on the wrong CPU, but that doesn't hurt. */
94
95 /* check for unmasked and pending */
96 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
97 jz 1f
982: call check_events
991:
100ENDPATCH(xen_restore_fl_direct)
101 ret
102 ENDPROC(xen_restore_fl_direct)
103 RELOC(xen_restore_fl_direct, 2b+1)
104
105/*
106 This is run where a normal iret would be run, with the same stack setup:
107 8: eflags
108 4: cs
109 esp-> 0: eip
110
111 This attempts to make sure that any pending events are dealt
112 with on return to usermode, but there is a small window in
113 which an event can happen just before entering usermode. If
114 the nested interrupt ends up setting one of the TIF_WORK_MASK
115 pending work flags, they will not be tested again before
116 returning to usermode. This means that a process can end up
117 with pending work, which will be unprocessed until the process
118 enters and leaves the kernel again, which could be an
119 unbounded amount of time. This means that a pending signal or
120 reschedule event could be indefinitely delayed.
121
122 The fix is to notice a nested interrupt in the critical
123 window, and if one occurs, then fold the nested interrupt into
124 the current interrupt stack frame, and re-process it
125 iteratively rather than recursively. This means that it will
126 exit via the normal path, and all pending work will be dealt
127 with appropriately.
128
129 Because the nested interrupt handler needs to deal with the
130   current stack state in whatever form it's in, we keep things
131 simple by only using a single register which is pushed/popped
132 on the stack.
133
134 Non-direct iret could be done in the same way, but it would
135 require an annoying amount of code duplication. We'll assume
136 that direct mode will be the common case once the hypervisor
137 support becomes commonplace.
138 */
139ENTRY(xen_iret_direct)
140 /* test eflags for special cases */
141 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
142 jnz hyper_iret
143
144 push %eax
145 ESP_OFFSET=4 # bytes pushed onto stack
146
147 /* Store vcpu_info pointer for easy access. Do it this
148 way to avoid having to reload %fs */
149#ifdef CONFIG_SMP
150 GET_THREAD_INFO(%eax)
151 movl TI_cpu(%eax),%eax
152 movl __per_cpu_offset(,%eax,4),%eax
153 lea per_cpu__xen_vcpu_info(%eax),%eax
154#else
155 movl $per_cpu__xen_vcpu_info, %eax
156#endif
157
158 /* check IF state we're restoring */
159 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
160
161 /* Maybe enable events. Once this happens we could get a
162 recursive event, so the critical region starts immediately
163 afterwards. However, if that happens we don't end up
164 resuming the code, so we don't have to be worried about
165 being preempted to another CPU. */
166 setz XEN_vcpu_info_mask(%eax)
167xen_iret_start_crit:
168
169 /* check for unmasked and pending */
170 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
171
172 /* If there's something pending, mask events again so we
173 can jump back into xen_hypervisor_callback */
174 sete XEN_vcpu_info_mask(%eax)
175
176 popl %eax
177
178 /* From this point on the registers are restored and the stack
179 updated, so we don't need to worry about it if we're preempted */
180iret_restore_end:
181
182 /* Jump to hypervisor_callback after fixing up the stack.
183 Events are masked, so jumping out of the critical
184 region is OK. */
185 je xen_hypervisor_callback
186
187 iret
188xen_iret_end_crit:
189
190hyper_iret:
191	/* put this out of line since it's very rarely used */
192 jmp hypercall_page + __HYPERVISOR_iret * 32
193
194 .globl xen_iret_start_crit, xen_iret_end_crit
195
196/*
197 This is called by xen_hypervisor_callback in entry.S when it sees
198 that the EIP at the time of interrupt was between xen_iret_start_crit
199 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
200 a more refined determination of what to do.
201
202 The stack format at this point is:
203 ----------------
204 ss : (ss/esp may be present if we came from usermode)
205 esp :
206 eflags } outer exception info
207 cs }
208 eip }
209 ---------------- <- edi (copy dest)
210 eax : outer eax if it hasn't been restored
211 ----------------
212 eflags } nested exception info
213 cs } (no ss/esp because we're nested
214 eip } from the same ring)
215 orig_eax }<- esi (copy src)
216 - - - - - - - -
217 fs }
218 es }
219 ds } SAVE_ALL state
220 eax }
221 : :
222 ebx }
223 ----------------
224 return addr <- esp
225 ----------------
226
227 In order to deliver the nested exception properly, we need to shift
228 everything from the return addr up to the error code so it
229 sits just under the outer exception info. This means that when we
230 handle the exception, we do it in the context of the outer exception
231 rather than starting a new one.
232
233 The only caveat is that if the outer eax hasn't been
234 restored yet (i.e., it's still on the stack), we need to insert
235 its value into the SAVE_ALL state before going on, since
236 it's usermode state which we eventually need to restore.
237 */
238ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /*
242 Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely?
246 Definitely. Easy to avoid? Yes. The Intel documents
247 explicitly say that the reported EIP for a bad jump is the
248 jump instruction itself, not the destination, but some virtual
249 environments get this wrong.
250 */
251 movl PT_CS+4(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx
254 je 2f
255
256 lea PT_ORIG_EAX+4(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi
258
259 /* If eip is before iret_restore_end then the stack
260 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax
262 jae 1f
263
264 movl 0+4(%edi),%eax /* copy EAX */
265 movl %eax, PT_EAX+4(%esp)
266
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268
269 /* set up the copy */
2701: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
272 rep movsl
273 cld
274
275 lea 4(%edi),%esp /* point esp to new frame */
2762: ret
277
278
279/*
280 Force an event check by making a hypercall,
281 but preserve regs before making the call.
282 */
283check_events:
284 push %eax
285 push %ecx
286 push %edx
287 call force_evtchn_callback
288 pop %edx
289 pop %ecx
290 pop %eax
291 ret
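
As a rough orientation for the assembly above, here is a C sketch of the
decision xen_iret_direct makes around its critical window; the struct, the
helper name and the spelled-out EFLAGS.IF constant below are illustrative
assumptions, not the kernel's actual definitions:

    /* Illustrative sketch only: approximates the logic of xen_iret_direct
     * around its critical region.  The field names mirror the Xen
     * vcpu_info ABI, but this struct and helper are simplified assumptions. */
    struct vcpu_info_sketch {
    	unsigned char evtchn_upcall_pending;	/* an event is waiting */
    	unsigned char evtchn_upcall_mask;	/* 1 = event delivery disabled */
    };

    static void xen_iret_sketch(struct vcpu_info_sketch *vcpu,
    			    unsigned long restored_eflags)
    {
    	/* Re-enable event delivery only if the frame being returned to
    	 * had interrupts enabled (EFLAGS.IF set). */
    	vcpu->evtchn_upcall_mask = !(restored_eflags & (1UL << 9) /* IF */);

    	/* ---- critical region: an event arriving between here and the
    	 * completed iret must not be handled recursively. */

    	if (!vcpu->evtchn_upcall_mask && vcpu->evtchn_upcall_pending) {
    		/* Pending and unmasked: mask events again and divert to
    		 * xen_hypervisor_callback instead of completing the iret. */
    		vcpu->evtchn_upcall_mask = 1;
    	} else {
    		/* Nothing pending: plain iret back to the interrupted code. */
    	}
    }

The real routine does the mask update and the pending check with single
setz/sete instructions so that the window between enabling events and the
iret stays as small as possible.
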
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
new file mode 100644
index 000000000000..f8d6937db2ec
--- /dev/null
+++ b/arch/x86/xen/xen-head.S
@@ -0,0 +1,38 @@
1/* Xen-specific pieces of head.S, intended to be included in the right
2 place in head.S */
3
4#ifdef CONFIG_XEN
5
6#include <linux/elfnote.h>
7#include <asm/boot.h>
8#include <xen/interface/elfnote.h>
9
10.pushsection .init.text
11ENTRY(startup_xen)
12 movl %esi,xen_start_info
13 cld
14 movl $(init_thread_union+THREAD_SIZE),%esp
15 jmp xen_start_kernel
16.popsection
17
18.pushsection .bss.page_aligned
19 .align PAGE_SIZE_asm
20ENTRY(hypercall_page)
21 .skip 0x1000
22.popsection
23
24 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
25 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
26 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
27 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
28 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
29 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
30 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
31#ifdef CONFIG_X86_PAE
32 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
33#else
34 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
35#endif
36 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
37
38#endif /*CONFIG_XEN */
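
The ELFNOTE() entries above become standard ELF note records in the kernel
image; the Xen domain builder reads them to learn the guest's virtual base,
entry point, hypercall page and feature flags.  As a hedged illustration of
that on-disk format, the sketch below walks a note buffer using the standard
Elf32_Nhdr layout from <elf.h>; the buffer handling is an assumption for
illustration, not code from this patch:

    /* Illustrative sketch: walk ELF note records such as those emitted by
     * the ELFNOTE() macros above.  Each record is an Elf32_Nhdr followed by
     * the 4-byte-padded name ("Xen") and the 4-byte-padded descriptor. */
    #include <elf.h>
    #include <stdio.h>
    #include <string.h>

    #define ALIGN4(x) (((x) + 3) & ~3u)

    static void walk_notes(const unsigned char *buf, size_t len)
    {
    	size_t off = 0;

    	while (off + sizeof(Elf32_Nhdr) <= len) {
    		const Elf32_Nhdr *nh = (const Elf32_Nhdr *)(buf + off);
    		const char *name = (const char *)(nh + 1);

    		if (nh->n_namesz == 4 && memcmp(name, "Xen", 4) == 0)
    			printf("Xen note type %u, %u byte payload\n",
    			       nh->n_type, nh->n_descsz);

    		off += sizeof(*nh) + ALIGN4(nh->n_namesz) + ALIGN4(nh->n_descsz);
    	}
    }

Running something like readelf -n on the built vmlinux should list the same
notes in human-readable form.
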
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/x86/xen/xen-ops.h
@@ -0,0 +1,71 @@
1#ifndef XEN_OPS_H
2#define XEN_OPS_H
3
4#include <linux/init.h>
5
6/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[];
8extern const char xen_failsafe_callback[];
9
10void xen_copy_trap_info(struct trap_info *traps);
11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3);
14
15extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info;
17
18char * __init xen_memory_setup(void);
19void __init xen_arch_setup(void);
20void __init xen_init_IRQ(void);
21
22void xen_setup_timer(int cpu);
23void xen_setup_cpu_clockevents(void);
24unsigned long xen_cpu_khz(void);
25void __init xen_time_init(void);
26unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void);
29
30void xen_mark_init_mm_pinned(void);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33
34static inline unsigned xen_get_lazy_mode(void)
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46
47void xen_smp_send_stop(void);
48void xen_smp_send_reschedule(int cpu);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
55 void *info, int wait);
56
57
58/* Declare an asm function, along with symbols needed to make it
59 inlineable */
60#define DECL_ASM(ret, name, ...) \
61 ret name(__VA_ARGS__); \
62 extern char name##_end[]; \
63 extern char name##_reloc[] \
64
65DECL_ASM(void, xen_irq_enable_direct, void);
66DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69
70void xen_iret_direct(void);
71#endif /* XEN_OPS_H */
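
For reference, DECL_ASM() above simply emits a prototype plus two marker
symbols.  A use such as DECL_ASM(void, xen_irq_enable_direct, void); expands
to roughly the following; the comments on how the _end/_reloc symbols are
used are an interpretation of the "make it inlineable" remark, not something
stated in this header:

    /* Approximate preprocessor expansion of
     * DECL_ASM(void, xen_irq_enable_direct, void); */
    void xen_irq_enable_direct(void);
    extern char xen_irq_enable_direct_end[];	/* end of the asm routine (interpretation) */
    extern char xen_irq_enable_direct_reloc[];	/* relocation point inside it (interpretation) */

The _end and _reloc symbols are defined by the assembly implementation, which
lets C code measure the routine and fix up any embedded reference when the
body is copied inline.
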