author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/x86_64/kernel
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- arch/x86_64/kernel/Makefile | 45
-rw-r--r-- arch/x86_64/kernel/acpi/Makefile | 3
-rw-r--r-- arch/x86_64/kernel/acpi/sleep.c | 132
-rw-r--r-- arch/x86_64/kernel/acpi/wakeup.S | 527
-rw-r--r-- arch/x86_64/kernel/aperture.c | 286
-rw-r--r-- arch/x86_64/kernel/apic.c | 1088
-rw-r--r-- arch/x86_64/kernel/asm-offsets.c | 69
-rw-r--r-- arch/x86_64/kernel/cpufreq/Kconfig | 96
-rw-r--r-- arch/x86_64/kernel/cpufreq/Makefile | 17
-rw-r--r-- arch/x86_64/kernel/e820.c | 513
-rw-r--r-- arch/x86_64/kernel/early_printk.c | 220
-rw-r--r-- arch/x86_64/kernel/entry.S | 920
-rw-r--r-- arch/x86_64/kernel/genapic.c | 89
-rw-r--r-- arch/x86_64/kernel/genapic_cluster.c | 130
-rw-r--r-- arch/x86_64/kernel/genapic_flat.c | 127
-rw-r--r-- arch/x86_64/kernel/head.S | 396
-rw-r--r-- arch/x86_64/kernel/head64.c | 117
-rw-r--r-- arch/x86_64/kernel/i387.c | 155
-rw-r--r-- arch/x86_64/kernel/i8259.c | 579
-rw-r--r-- arch/x86_64/kernel/init_task.c | 49
-rw-r--r-- arch/x86_64/kernel/io_apic.c | 1982
-rw-r--r-- arch/x86_64/kernel/ioport.c | 117
-rw-r--r-- arch/x86_64/kernel/irq.c | 108
-rw-r--r-- arch/x86_64/kernel/kprobes.c | 631
-rw-r--r-- arch/x86_64/kernel/ldt.c | 253
-rw-r--r-- arch/x86_64/kernel/mce.c | 548
-rw-r--r-- arch/x86_64/kernel/mce_intel.c | 99
-rw-r--r-- arch/x86_64/kernel/module.c | 166
-rw-r--r-- arch/x86_64/kernel/mpparse.c | 949
-rw-r--r-- arch/x86_64/kernel/msr.c | 279
-rw-r--r-- arch/x86_64/kernel/nmi.c | 488
-rw-r--r-- arch/x86_64/kernel/pci-dma.c | 60
-rw-r--r-- arch/x86_64/kernel/pci-gart.c | 980
-rw-r--r-- arch/x86_64/kernel/pci-nommu.c | 94
-rw-r--r-- arch/x86_64/kernel/process.c | 770
-rw-r--r-- arch/x86_64/kernel/ptrace.c | 547
-rw-r--r-- arch/x86_64/kernel/reboot.c | 163
-rw-r--r-- arch/x86_64/kernel/semaphore.c | 180
-rw-r--r-- arch/x86_64/kernel/setup.c | 1189
-rw-r--r-- arch/x86_64/kernel/setup64.c | 292
-rw-r--r-- arch/x86_64/kernel/signal.c | 486
-rw-r--r-- arch/x86_64/kernel/smp.c | 415
-rw-r--r-- arch/x86_64/kernel/smpboot.c | 938
-rw-r--r-- arch/x86_64/kernel/suspend.c | 157
-rw-r--r-- arch/x86_64/kernel/suspend_asm.S | 104
-rw-r--r-- arch/x86_64/kernel/sys_x86_64.c | 173
-rw-r--r-- arch/x86_64/kernel/syscall.c | 26
-rw-r--r-- arch/x86_64/kernel/time.c | 1262
-rw-r--r-- arch/x86_64/kernel/trampoline.S | 64
-rw-r--r-- arch/x86_64/kernel/traps.c | 948
-rw-r--r-- arch/x86_64/kernel/vmlinux.lds.S | 164
-rw-r--r-- arch/x86_64/kernel/vsyscall.c | 225
-rw-r--r-- arch/x86_64/kernel/x8664_ksyms.c | 221
53 files changed, 20636 insertions(+), 0 deletions(-)
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head.o head64.o init_task.o vmlinux.lds
6EXTRA_AFLAGS := -traditional
7obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
9 x8664_ksyms.o i387.o syscall.o vsyscall.o \
10 setup64.o bootflag.o e820.o reboot.o quirks.o
11
12obj-$(CONFIG_X86_MCE) += mce.o
13obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
14obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
15obj-$(CONFIG_ACPI_BOOT) += acpi/
16obj-$(CONFIG_X86_MSR) += msr.o
17obj-$(CONFIG_MICROCODE) += microcode.o
18obj-$(CONFIG_X86_CPUID) += cpuid.o
19obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
20obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
21obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
22 genapic.o genapic_cluster.o genapic_flat.o
23obj-$(CONFIG_PM) += suspend.o
24obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
25obj-$(CONFIG_CPU_FREQ) += cpufreq/
26obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
27obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
28obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
29obj-$(CONFIG_SWIOTLB) += swiotlb.o
30obj-$(CONFIG_KPROBES) += kprobes.o
31
32obj-$(CONFIG_MODULES) += module.o
33
34obj-y += topology.o
35obj-y += intel_cacheinfo.o
36
37CFLAGS_vsyscall.o := $(PROFILING) -g0
38
39bootflag-y += ../../i386/kernel/bootflag.o
40cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
41topology-y += ../../i386/mach-default/topology.o
42swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
43microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
44intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
45quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_ACPI_BOOT) := boot.o
2boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
3obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/config.h>
30#include <linux/kernel.h>
31#include <linux/init.h>
32#include <linux/types.h>
33#include <linux/stddef.h>
34#include <linux/slab.h>
35#include <linux/pci.h>
36#include <linux/bootmem.h>
37#include <linux/irq.h>
38#include <linux/acpi.h>
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50
51/* --------------------------------------------------------------------------
52 Low-Level Sleep Support
53 -------------------------------------------------------------------------- */
54
55#ifdef CONFIG_ACPI_SLEEP
56
57/* address in low memory of the wakeup routine. */
58unsigned long acpi_wakeup_address = 0;
59unsigned long acpi_video_flags;
60extern char wakeup_start, wakeup_end;
61
62extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
63
64static pgd_t low_ptr;
65
66static void init_low_mapping(void)
67{
68 pgd_t *slot0 = pgd_offset(current->mm, 0UL);
69 low_ptr = *slot0;
70 set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
71 flush_tlb_all();
72}
73
74/**
75 * acpi_save_state_mem - save kernel state
76 *
77 * Create an identity mapped page table and copy the wakeup routine to
78 * low memory.
79 */
80int acpi_save_state_mem (void)
81{
82 init_low_mapping();
83
84 memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
85 acpi_copy_wakeup_routine(acpi_wakeup_address);
86
87 return 0;
88}
89
90/*
91 * acpi_restore_state
92 */
93void acpi_restore_state_mem (void)
94{
95 set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
96 flush_tlb_all();
97}
98
99/**
100 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
101 *
102 * We allocate a page in low memory for the wakeup
103 * routine for when we come back from a sleep state. The
104 * runtime allocator allows specification of <16M pages, but not
105 * <1M pages.
106 */
107void __init acpi_reserve_bootmem(void)
108{
109 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
110 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
111 printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
112}
113
114static int __init acpi_sleep_setup(char *str)
115{
116 while ((str != NULL) && (*str != '\0')) {
117 if (strncmp(str, "s3_bios", 7) == 0)
118 acpi_video_flags = 1;
119 if (strncmp(str, "s3_mode", 7) == 0)
120 acpi_video_flags |= 2;
121 str = strchr(str, ',');
122 if (str != NULL)
123 str += strspn(str, ", \t");
124 }
125 return 1;
126}
127
128__setup("acpi_sleep=", acpi_sleep_setup);
129
130#endif /*CONFIG_ACPI_SLEEP*/
131
132void acpi_pci_link_exit(void) {}
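The acpi_sleep_setup() handler above folds the acpi_sleep= boot options into acpi_video_flags: s3_bios sets bit 0 and s3_mode sets bit 1, so acpi_sleep=s3_bios,s3_mode yields 3. A minimal userspace sketch of the same parsing (a hypothetical test harness, not part of this patch) could look like:

#include <stdio.h>
#include <string.h>

/* Hypothetical userspace mirror of the acpi_sleep= parsing in sleep.c above:
 * "s3_bios" sets bit 0, "s3_mode" sets bit 1 of the returned flag word. */
static unsigned long parse_acpi_sleep(const char *str)
{
	unsigned long flags = 0;

	while (str != NULL && *str != '\0') {
		if (strncmp(str, "s3_bios", 7) == 0)
			flags = 1;
		if (strncmp(str, "s3_mode", 7) == 0)
			flags |= 2;
		str = strchr(str, ',');
		if (str != NULL)
			str += strspn(str, ", \t");
	}
	return flags;
}

int main(void)
{
	printf("%lu\n", parse_acpi_sleep("s3_bios,s3_mode"));   /* prints 3 */
	return 0;
}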
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
1.text
2#include <linux/linkage.h>
3#include <asm/segment.h>
4#include <asm/page.h>
5#include <asm/msr.h>
6
7# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
8#
9# wakeup_code runs in real mode, and at an unknown address (determined at run-time).
10# Therefore it must only use relative jumps/calls.
11#
12# Do we need to deal with A20? It is okay: the ACPI spec says A20 must be enabled
13#
14# If physical address of wakeup_code is 0x12345, BIOS should call us with
15# cs = 0x1234, eip = 0x05
16#
17
18
19ALIGN
20 .align 16
21ENTRY(wakeup_start)
22wakeup_code:
23 wakeup_code_start = .
24 .code16
25
26# Running in a *copy* of this code, somewhere in the low 1MB.
27
28 movb $0xa1, %al ; outb %al, $0x80
29 cli
30 cld
31 # setup data segment
32 movw %cs, %ax
33 movw %ax, %ds # Make ds:0 point to wakeup_start
34 movw %ax, %ss
35 mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
36
37 pushl $0 # Kill any dangerous flags
38 popfl
39
40 movl real_magic - wakeup_code, %eax
41 cmpl $0x12345678, %eax
42 jne bogus_real_magic
43
44 testl $1, video_flags - wakeup_code
45 jz 1f
46 lcall $0xc000,$3
47 movw %cs, %ax
48 movw %ax, %ds # Bios might have played with that
49 movw %ax, %ss
501:
51
52 testl $2, video_flags - wakeup_code
53 jz 1f
54 mov video_mode - wakeup_code, %ax
55 call mode_seta
561:
57
58 movw $0xb800, %ax
59 movw %ax,%fs
60 movw $0x0e00 + 'L', %fs:(0x10)
61
62 movb $0xa2, %al ; outb %al, $0x80
63
64 lidt %ds:idt_48a - wakeup_code
65 xorl %eax, %eax
66 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
67 shll $4, %eax
68 addl $(gdta - wakeup_code), %eax
69 movl %eax, gdt_48a +2 - wakeup_code
70 lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
71 # appropriate
72
73 movl $1, %eax # protected mode (PE) bit
74 lmsw %ax # This is it!
75 jmp 1f
761:
77
78 .byte 0x66, 0xea # prefix + jmpi-opcode
79 .long wakeup_32 - __START_KERNEL_map
80 .word __KERNEL_CS
81
82 .code32
83wakeup_32:
84# Running in this code, but at low address; paging is not yet turned on.
85 movb $0xa5, %al ; outb %al, $0x80
86
87 /* Check if extended functions are implemented */
88 movl $0x80000000, %eax
89 cpuid
90 cmpl $0x80000000, %eax
91 jbe bogus_cpu
92 wbinvd
93 mov $0x80000001, %eax
94 cpuid
95 btl $29, %edx
96 jnc bogus_cpu
97 movl %edx,%edi
98
99 movw $__KERNEL_DS, %ax
100 movw %ax, %ds
101 movw %ax, %es
102 movw %ax, %fs
103 movw %ax, %gs
104
105 movw $__KERNEL_DS, %ax
106 movw %ax, %ss
107
108 mov $(wakeup_stack - __START_KERNEL_map), %esp
109 movl saved_magic - __START_KERNEL_map, %eax
110 cmpl $0x9abcdef0, %eax
111 jne bogus_32_magic
112
113 /*
114 * Prepare for entering 64bits mode
115 */
116
117 /* Enable PAE mode and PGE */
118 xorl %eax, %eax
119 btsl $5, %eax
120 btsl $7, %eax
121 movl %eax, %cr4
122
123 /* Setup early boot stage 4 level pagetables */
124 movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax
125 movl %eax, %cr3
126
127 /* Setup EFER (Extended Feature Enable Register) */
128 movl $MSR_EFER, %ecx
129 rdmsr
130 /* Fool rdmsr and reset %eax to avoid dependences */
131 xorl %eax, %eax
132 /* Enable Long Mode */
133 btsl $_EFER_LME, %eax
134 /* Enable System Call */
135 btsl $_EFER_SCE, %eax
136
137 /* No Execute supported? */
138 btl $20,%edi
139 jnc 1f
140 btsl $_EFER_NX, %eax
1411:
142
143 /* Make changes effective */
144 wrmsr
145 wbinvd
146
147 xorl %eax, %eax
148 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
149 btsl $0, %eax /* Enable protected mode */
150 btsl $1, %eax /* Enable MP */
151 btsl $4, %eax /* Enable ET */
152 btsl $5, %eax /* Enable NE */
153 btsl $16, %eax /* Enable WP */
154 btsl $18, %eax /* Enable AM */
155
156 /* Make changes effective */
157 movl %eax, %cr0
158 /* At this point:
159 CR4.PAE must be 1
160 CS.L must be 0
161 CR3 must point to PML4
162 Next instruction must be a branch
163 This must be on identity-mapped page
164 */
165 jmp reach_compatibility_mode
166reach_compatibility_mode:
167 movw $0x0e00 + 'i', %ds:(0xb8012)
168 movb $0xa8, %al ; outb %al, $0x80;
169
170 /*
171 * At this point we're in long mode but in 32bit compatibility mode
172 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
173 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load
174 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
175 */
176
177 movw $0x0e00 + 'n', %ds:(0xb8014)
178 movb $0xa9, %al ; outb %al, $0x80
179
180 /* Load new GDT with the 64bit segment using 32bit descriptor */
181 movl $(pGDT32 - __START_KERNEL_map), %eax
182 lgdt (%eax)
183
184 movl $(wakeup_jumpvector - __START_KERNEL_map), %eax
185 /* Finally jump in 64bit mode */
186 ljmp *(%eax)
187
188wakeup_jumpvector:
189 .long wakeup_long64 - __START_KERNEL_map
190 .word __KERNEL_CS
191
192.code64
193
194 /* Hooray, we are in Long 64-bit mode (but still running in low memory) */
195wakeup_long64:
196 /*
197 * We must switch to a new descriptor in kernel space for the GDT
198 * because soon the kernel won't have access anymore to the userspace
199 * addresses where we're currently running on. We have to do that here
200 * because in 32bit we couldn't load a 64bit linear address.
201 */
202 lgdt cpu_gdt_descr - __START_KERNEL_map
203
204 movw $0x0e00 + 'u', %ds:(0xb8016)
205
206 nop
207 nop
208 movw $__KERNEL_DS, %ax
209 movw %ax, %ss
210 movw %ax, %ds
211 movw %ax, %es
212 movw %ax, %fs
213 movw %ax, %gs
214 movq saved_esp, %rsp
215
216 movw $0x0e00 + 'x', %ds:(0xb8018)
217 movq saved_ebx, %rbx
218 movq saved_edi, %rdi
219 movq saved_esi, %rsi
220 movq saved_ebp, %rbp
221
222 movw $0x0e00 + '!', %ds:(0xb801a)
223 movq saved_eip, %rax
224 jmp *%rax
225
226.code32
227
228 .align 64
229gdta:
230 .word 0, 0, 0, 0 # dummy
231
232 .word 0, 0, 0, 0 # unused
233
234 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
235 .word 0 # base address = 0
236 .word 0x9B00 # code read/exec. ??? Why do I need 0x9B00 (as opposed to 0x9A00) in order for this to work?
237 .word 0x00CF # granularity = 4096, 386
238 # (+5th nibble of limit)
239
240 .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
241 .word 0 # base address = 0
242 .word 0x9200 # data read/write
243 .word 0x00CF # granularity = 4096, 386
244 # (+5th nibble of limit)
245# this is 64bit descriptor for code
246 .word 0xFFFF
247 .word 0
248 .word 0x9A00 # code read/exec
249 .word 0x00AF # as above, but it is long mode and with D=0
250
251idt_48a:
252 .word 0 # idt limit = 0
253 .word 0, 0 # idt base = 0L
254
255gdt_48a:
256 .word 0x8000 # gdt limit=2048,
257 # 256 GDT entries
258 .word 0, 0 # gdt base (filled in later)
259
260
261real_save_gdt: .word 0
262 .quad 0
263real_magic: .quad 0
264video_mode: .quad 0
265video_flags: .quad 0
266
267bogus_real_magic:
268 movb $0xba,%al ; outb %al,$0x80
269 jmp bogus_real_magic
270
271bogus_32_magic:
272 movb $0xb3,%al ; outb %al,$0x80
273 jmp bogus_32_magic
274
275bogus_31_magic:
276 movb $0xb1,%al ; outb %al,$0x80
277 jmp bogus_31_magic
278
279bogus_cpu:
280 movb $0xbc,%al ; outb %al,$0x80
281 jmp bogus_cpu
282
283
284/* This code uses an extended set of video mode numbers. These include:
285 * Aliases for standard modes
286 * NORMAL_VGA (-1)
287 * EXTENDED_VGA (-2)
288 * ASK_VGA (-3)
289 * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
290 * of compatibility when extending the table. These are between 0x00 and 0xff.
291 */
292#define VIDEO_FIRST_MENU 0x0000
293
294/* Standard BIOS video modes (BIOS number + 0x0100) */
295#define VIDEO_FIRST_BIOS 0x0100
296
297/* VESA BIOS video modes (VESA number + 0x0200) */
298#define VIDEO_FIRST_VESA 0x0200
299
300/* Video7 special modes (BIOS number + 0x0900) */
301#define VIDEO_FIRST_V7 0x0900
302
303# Setting of user mode (AX=mode ID) => CF=success
304mode_seta:
305 movw %ax, %bx
306#if 0
307 cmpb $0xff, %ah
308 jz setalias
309
310 testb $VIDEO_RECALC>>8, %ah
311 jnz _setrec
312
313 cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
314 jnc setres
315
316 cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
317 jz setspc
318
319 cmpb $VIDEO_FIRST_V7>>8, %ah
320 jz setv7
321#endif
322
323 cmpb $VIDEO_FIRST_VESA>>8, %ah
324 jnc check_vesaa
325#if 0
326 orb %ah, %ah
327 jz setmenu
328#endif
329
330 decb %ah
331# jz setbios Add bios modes later
332
333setbada: clc
334 ret
335
336check_vesaa:
337 subb $VIDEO_FIRST_VESA>>8, %bh
338 orw $0x4000, %bx # Use linear frame buffer
339 movw $0x4f02, %ax # VESA BIOS mode set call
340 int $0x10
341 cmpw $0x004f, %ax # AL=4f if implemented
342 jnz _setbada # AH=0 if OK
343
344 stc
345 ret
346
347_setbada: jmp setbada
348
349 .code64
350bogus_magic:
351 movw $0x0e00 + 'B', %ds:(0xb8018)
352 jmp bogus_magic
353
354bogus_magic2:
355 movw $0x0e00 + '2', %ds:(0xb8018)
356 jmp bogus_magic2
357
358
359wakeup_stack_begin: # Stack grows down
360
361.org 0xff0
362wakeup_stack: # Just below end of page
363
364ENTRY(wakeup_end)
365
366##
367# acpi_copy_wakeup_routine
368#
369# Copy the above routine to low memory.
370#
371# Parameters:
372# %rdi: place to copy wakeup routine to
373#
374# Returned address is location of code in low memory (past data and stack)
375#
376ENTRY(acpi_copy_wakeup_routine)
377 pushq %rax
378 pushq %rcx
379 pushq %rdx
380
381 sgdt saved_gdt
382 sidt saved_idt
383 sldt saved_ldt
384 str saved_tss
385
386 movq %cr3, %rdx
387 movq %rdx, saved_cr3
388 movq %cr4, %rdx
389 movq %rdx, saved_cr4
390 movq %cr0, %rdx
391 movq %rdx, saved_cr0
392 sgdt real_save_gdt - wakeup_start (,%rdi)
393 movl $MSR_EFER, %ecx
394 rdmsr
395 movl %eax, saved_efer
396 movl %edx, saved_efer2
397
398 movl saved_video_mode, %edx
399 movl %edx, video_mode - wakeup_start (,%rdi)
400 movl acpi_video_flags, %edx
401 movl %edx, video_flags - wakeup_start (,%rdi)
402 movq $0x12345678, real_magic - wakeup_start (,%rdi)
403 movq $0x123456789abcdef0, %rdx
404 movq %rdx, saved_magic
405
406 movl saved_magic - __START_KERNEL_map, %eax
407 cmpl $0x9abcdef0, %eax
408 jne bogus_32_magic
409
410 # make sure %cr4 is set correctly (features, etc)
411 movl saved_cr4 - __START_KERNEL_map, %eax
412 movq %rax, %cr4
413
414 movl saved_cr0 - __START_KERNEL_map, %eax
415 movq %rax, %cr0
416 jmp 1f # Flush pipelines
4171:
418 # restore the regs we used
419 popq %rdx
420 popq %rcx
421 popq %rax
422ENTRY(do_suspend_lowlevel_s4bios)
423 ret
424
425 .align 2
426 .p2align 4,,15
427.globl do_suspend_lowlevel
428 .type do_suspend_lowlevel,@function
429do_suspend_lowlevel:
430.LFB5:
431 subq $8, %rsp
432 xorl %eax, %eax
433 call save_processor_state
434
435 movq %rsp, saved_context_esp(%rip)
436 movq %rax, saved_context_eax(%rip)
437 movq %rbx, saved_context_ebx(%rip)
438 movq %rcx, saved_context_ecx(%rip)
439 movq %rdx, saved_context_edx(%rip)
440 movq %rbp, saved_context_ebp(%rip)
441 movq %rsi, saved_context_esi(%rip)
442 movq %rdi, saved_context_edi(%rip)
443 movq %r8, saved_context_r08(%rip)
444 movq %r9, saved_context_r09(%rip)
445 movq %r10, saved_context_r10(%rip)
446 movq %r11, saved_context_r11(%rip)
447 movq %r12, saved_context_r12(%rip)
448 movq %r13, saved_context_r13(%rip)
449 movq %r14, saved_context_r14(%rip)
450 movq %r15, saved_context_r15(%rip)
451 pushfq ; popq saved_context_eflags(%rip)
452
453 movq $.L97, saved_eip(%rip)
454
455 movq %rsp,saved_esp
456 movq %rbp,saved_ebp
457 movq %rbx,saved_ebx
458 movq %rdi,saved_edi
459 movq %rsi,saved_esi
460
461 addq $8, %rsp
462 movl $3, %edi
463 xorl %eax, %eax
464 jmp acpi_enter_sleep_state
465.L97:
466 .p2align 4,,7
467.L99:
468 .align 4
469 movl $24, %eax
470 movw %ax, %ds
471 movq saved_context+58(%rip), %rax
472 movq %rax, %cr4
473 movq saved_context+50(%rip), %rax
474 movq %rax, %cr3
475 movq saved_context+42(%rip), %rax
476 movq %rax, %cr2
477 movq saved_context+34(%rip), %rax
478 movq %rax, %cr0
479 pushq saved_context_eflags(%rip) ; popfq
480 movq saved_context_esp(%rip), %rsp
481 movq saved_context_ebp(%rip), %rbp
482 movq saved_context_eax(%rip), %rax
483 movq saved_context_ebx(%rip), %rbx
484 movq saved_context_ecx(%rip), %rcx
485 movq saved_context_edx(%rip), %rdx
486 movq saved_context_esi(%rip), %rsi
487 movq saved_context_edi(%rip), %rdi
488 movq saved_context_r08(%rip), %r8
489 movq saved_context_r09(%rip), %r9
490 movq saved_context_r10(%rip), %r10
491 movq saved_context_r11(%rip), %r11
492 movq saved_context_r12(%rip), %r12
493 movq saved_context_r13(%rip), %r13
494 movq saved_context_r14(%rip), %r14
495 movq saved_context_r15(%rip), %r15
496
497 xorl %eax, %eax
498 addq $8, %rsp
499 jmp restore_processor_state
500.LFE5:
501.Lfe5:
502 .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
503
504.data
505ALIGN
506ENTRY(saved_ebp) .quad 0
507ENTRY(saved_esi) .quad 0
508ENTRY(saved_edi) .quad 0
509ENTRY(saved_ebx) .quad 0
510
511ENTRY(saved_eip) .quad 0
512ENTRY(saved_esp) .quad 0
513
514ENTRY(saved_magic) .quad 0
515
516ALIGN
517# saved registers
518saved_gdt: .quad 0,0
519saved_idt: .quad 0,0
520saved_ldt: .quad 0
521saved_tss: .quad 0
522
523saved_cr0: .quad 0
524saved_cr3: .quad 0
525saved_cr4: .quad 0
526saved_efer: .quad 0
527saved_efer2: .quad 0
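The wakeup_32 path above enables long mode by setting individual control-register bits with btsl: PAE and PGE in CR4, LME (plus SCE, and NX when CPUID reports it) in EFER, then PG, PE and friends in CR0. As a reading aid, here is a small standalone C sketch that spells out the same bit positions; the macro names are illustrative, not taken from the kernel headers.

#include <stdio.h>

/* Bit positions used by the btsl sequence in wakeup_32 above (illustrative names). */
#define CR4_PAE   (1u << 5)    /* Physical Address Extension          */
#define CR4_PGE   (1u << 7)    /* Page Global Enable                  */
#define EFER_SCE  (1u << 0)    /* System Call Extensions (_EFER_SCE)  */
#define EFER_LME  (1u << 8)    /* Long Mode Enable (_EFER_LME)        */
#define EFER_NX   (1u << 11)   /* No-Execute enable (_EFER_NX)        */
#define CR0_PE    (1u << 0)    /* Protected mode                      */
#define CR0_MP    (1u << 1)
#define CR0_ET    (1u << 4)
#define CR0_NE    (1u << 5)
#define CR0_WP    (1u << 16)
#define CR0_AM    (1u << 18)
#define CR0_PG    (1u << 31)   /* Paging; with EFER.LME=1 this activates long mode */

int main(void)
{
	printf("CR4  = %#x\n", CR4_PAE | CR4_PGE);
	/* EFER_NX is OR'ed in only when CPUID reports the NX bit, as in the code above. */
	printf("EFER = %#x\n", EFER_LME | EFER_SCE);
	printf("CR0  = %#x\n", CR0_PG | CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_WP | CR0_AM);
	return 0;
}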
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
1/*
2 * Firmware replacement code.
3 *
4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/types.h>
16#include <linux/init.h>
17#include <linux/bootmem.h>
18#include <linux/mmzone.h>
19#include <linux/pci_ids.h>
20#include <linux/pci.h>
21#include <linux/bitops.h>
22#include <asm/e820.h>
23#include <asm/io.h>
24#include <asm/proto.h>
25#include <asm/pci-direct.h>
26
27int iommu_aperture;
28int iommu_aperture_disabled __initdata = 0;
29int iommu_aperture_allowed __initdata = 0;
30
31int fallback_aper_order __initdata = 1; /* 64MB */
32int fallback_aper_force __initdata = 0;
33
34int fix_aperture __initdata = 1;
35
36/* This code runs before the PCI subsystem is initialized, so just
37 access the northbridge directly. */
38
39#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
40
41static u32 __init allocate_aperture(void)
42{
43#ifdef CONFIG_DISCONTIGMEM
44 pg_data_t *nd0 = NODE_DATA(0);
45#else
46 pg_data_t *nd0 = &contig_page_data;
47#endif
48 u32 aper_size;
49 void *p;
50
51 if (fallback_aper_order > 7)
52 fallback_aper_order = 7;
53 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
54
55 /*
56 * Aperture has to be naturally aligned. This means a 2GB aperture won't
57 * have much chance of finding a place in the lower 4GB of memory.
58 * Unfortunately we cannot move it up because that would make the
59 * IOMMU useless.
60 */
61 p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0);
62 if (!p || __pa(p)+aper_size > 0xffffffff) {
63 printk("Cannot allocate aperture memory hole (%p,%uK)\n",
64 p, aper_size>>10);
65 if (p)
66 free_bootmem_node(nd0, (unsigned long)p, aper_size);
67 return 0;
68 }
69 printk("Mapping aperture over %d KB of RAM @ %lx\n",
70 aper_size >> 10, __pa(p));
71 return (u32)__pa(p);
72}
73
74static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
75{
76 if (!aper_base)
77 return 0;
78 if (aper_size < 64*1024*1024) {
79 printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);
80 return 0;
81 }
82 if (aper_base + aper_size >= 0xffffffff) {
83 printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
84 return 0;
85 }
86 if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
87 printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
88 return 0;
89 }
90 return 1;
91}
92
93/* Find a PCI capability */
94static __u32 __init find_cap(int num, int slot, int func, int cap)
95{
96 u8 pos;
97 int bytes;
98 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
99 return 0;
100 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
101 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
102 u8 id;
103 pos &= ~3;
104 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
105 if (id == 0xff)
106 break;
107 if (id == cap)
108 return pos;
109 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
110 }
111 return 0;
112}
113
114/* Read a standard AGPv3 bridge header */
115static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
116{
117 u32 apsize;
118 u32 apsizereg;
119 int nbits;
120 u32 aper_low, aper_hi;
121 u64 aper;
122
123 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
124 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
125 if (apsizereg == 0xffffffff) {
126 printk("APSIZE in AGP bridge unreadable\n");
127 return 0;
128 }
129
130 apsize = apsizereg & 0xfff;
131 /* Some BIOS use weird encodings not in the AGPv3 table. */
132 if (apsize & 0xff)
133 apsize |= 0xf00;
134 nbits = hweight16(apsize);
135 *order = 7 - nbits;
136 if ((int)*order < 0) /* < 32MB */
137 *order = 0;
138
139 aper_low = read_pci_config(num,slot,func, 0x10);
140 aper_hi = read_pci_config(num,slot,func,0x14);
141 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
142
143 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
144 aper, 32 << *order, apsizereg);
145
146 if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
147 return 0;
148 return (u32)aper;
149}
150
151/* Look for an AGP bridge. Windows only expects the aperture in the
152 AGP bridge and some BIOSes forget to initialize the Northbridge too.
153 Work around this here.
154
155 Do a PCI bus scan by hand because we're running before the PCI
156 subsystem.
157
158 All K8 AGP bridges are AGPv3 compliant, so we can do this scan
159 generically. It's probably overkill to always scan all slots because
160 the AGP bridges should always be on their own bus in the HT hierarchy,
161 but do it here for future safety. */
162static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
163{
164 int num, slot, func;
165
166 /* Poor man's PCI discovery */
167 for (num = 0; num < 32; num++) {
168 for (slot = 0; slot < 32; slot++) {
169 for (func = 0; func < 8; func++) {
170 u32 class, cap;
171 u8 type;
172 class = read_pci_config(num,slot,func,
173 PCI_CLASS_REVISION);
174 if (class == 0xffffffff)
175 break;
176
177 switch (class >> 16) {
178 case PCI_CLASS_BRIDGE_HOST:
179 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
180 /* AGP bridge? */
181 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
182 if (!cap)
183 break;
184 *valid_agp = 1;
185 return read_agp(num,slot,func,cap,order);
186 }
187
188 /* No multi-function device? */
189 type = read_pci_config_byte(num,slot,func,
190 PCI_HEADER_TYPE);
191 if (!(type & 0x80))
192 break;
193 }
194 }
195 }
196 printk("No AGP bridge found\n");
197 return 0;
198}
199
200void __init iommu_hole_init(void)
201{
202 int fix, num;
203 u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
204 u64 aper_base, last_aper_base = 0;
205 int valid_agp = 0;
206
207 if (iommu_aperture_disabled || !fix_aperture)
208 return;
209
210 printk("Checking aperture...\n");
211
212 fix = 0;
213 for (num = 24; num < 32; num++) {
214 char name[30];
215 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
216 continue;
217
218 iommu_aperture = 1;
219
220 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
221 aper_size = (32 * 1024 * 1024) << aper_order;
222 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
223 aper_base <<= 25;
224
225 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
226 aper_base, aper_size>>20);
227
228 sprintf(name, "northbridge cpu %d", num-24);
229
230 if (!aperture_valid(name, aper_base, aper_size)) {
231 fix = 1;
232 break;
233 }
234
235 if ((last_aper_order && aper_order != last_aper_order) ||
236 (last_aper_base && aper_base != last_aper_base)) {
237 fix = 1;
238 break;
239 }
240 last_aper_order = aper_order;
241 last_aper_base = aper_base;
242 }
243
244 if (!fix && !fallback_aper_force)
245 return;
246
247 if (!fallback_aper_force)
248 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
249
250 if (aper_alloc) {
251 /* Got the aperture from the AGP bridge */
252 } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
253 force_iommu ||
254 valid_agp ||
255 fallback_aper_force) {
256 printk("Your BIOS doesn't leave an aperture memory hole\n");
257 printk("Please enable the IOMMU option in the BIOS setup\n");
258 printk("This costs you %d MB of RAM\n",
259 32 << fallback_aper_order);
260
261 aper_order = fallback_aper_order;
262 aper_alloc = allocate_aperture();
263 if (!aper_alloc) {
264 /* Could disable AGP and IOMMU here, but it's probably
265 not worth it. But the later users cannot deal with
266 bad apertures and turning on the aperture over memory
267 causes very strange problems, so it's better to
268 panic early. */
269 panic("Not enough memory for aperture");
270 }
271 } else {
272 return;
273 }
274
275 /* Fix up the north bridges */
276 for (num = 24; num < 32; num++) {
277 if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
278 continue;
279
280 /* Don't enable translation yet. That is done later.
281 Assume this BIOS didn't initialise the GART so
282 just overwrite all previous bits */
283 write_pci_config(0, num, 3, 0x90, aper_order<<1);
284 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
285 }
286}
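iommu_hole_init() above reads the K8 northbridge aperture registers through PCI config space: the order comes from bits 1..3 of offset 0x90 (size = 32MB << order) and the base from offset 0x94, shifted left by 25 bits. A small standalone C sketch with made-up register values shows the same decoding; the values and names here are examples only, not read from real hardware.

#include <stdio.h>
#include <stdint.h>

/* Decode example K8 aperture register values the way iommu_hole_init() does.
 * reg90/reg94 below are made-up sample values, not actual hardware reads. */
int main(void)
{
	uint32_t reg90 = 0x6;   /* order field: (0x6 >> 1) & 7 = 3 */
	uint32_t reg94 = 0x4;   /* base field: 4 << 25 = 128 MB    */

	uint32_t aper_order = (reg90 >> 1) & 7;
	uint64_t aper_size  = (32ULL << 20) << aper_order;          /* 32MB << order = 256 MB */
	uint64_t aper_base  = (uint64_t)(reg94 & 0x7fff) << 25;

	printf("order %u -> size %llu MB, base %#llx\n",
	       (unsigned)aper_order, (unsigned long long)(aper_size >> 20),
	       (unsigned long long)aper_base);
	return 0;
}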
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/config.h>
18#include <linux/init.h>
19
20#include <linux/mm.h>
21#include <linux/irq.h>
22#include <linux/delay.h>
23#include <linux/bootmem.h>
24#include <linux/smp_lock.h>
25#include <linux/interrupt.h>
26#include <linux/mc146818rtc.h>
27#include <linux/kernel_stat.h>
28#include <linux/sysdev.h>
29
30#include <asm/atomic.h>
31#include <asm/smp.h>
32#include <asm/mtrr.h>
33#include <asm/mpspec.h>
34#include <asm/pgalloc.h>
35#include <asm/mach_apic.h>
36
37int apic_verbosity;
38
39int disable_apic_timer __initdata;
40
41/* Using APIC to generate smp_local_timer_interrupt? */
42int using_apic_timer = 0;
43
44static DEFINE_PER_CPU(int, prof_multiplier) = 1;
45static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
46static DEFINE_PER_CPU(int, prof_counter) = 1;
47
48static void apic_pm_activate(void);
49
50void enable_NMI_through_LVT0 (void * dummy)
51{
52 unsigned int v, ver;
53
54 ver = apic_read(APIC_LVR);
55 ver = GET_APIC_VERSION(ver);
56 v = APIC_DM_NMI; /* unmask and set to NMI */
57 apic_write_around(APIC_LVT0, v);
58}
59
60int get_maxlvt(void)
61{
62 unsigned int v, ver, maxlvt;
63
64 v = apic_read(APIC_LVR);
65 ver = GET_APIC_VERSION(v);
66 maxlvt = GET_APIC_MAXLVT(v);
67 return maxlvt;
68}
69
70void clear_local_APIC(void)
71{
72 int maxlvt;
73 unsigned int v;
74
75 maxlvt = get_maxlvt();
76
77 /*
78 * Masking an LVT entry on a P6 can trigger a local APIC error
79 * if the vector is zero. Mask LVTERR first to prevent this.
80 */
81 if (maxlvt >= 3) {
82 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
83 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
84 }
85 /*
86 * Careful: we have to set masks only first to deassert
87 * any level-triggered sources.
88 */
89 v = apic_read(APIC_LVTT);
90 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
91 v = apic_read(APIC_LVT0);
92 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
93 v = apic_read(APIC_LVT1);
94 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
95 if (maxlvt >= 4) {
96 v = apic_read(APIC_LVTPC);
97 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
98 }
99
100 /*
101 * Clean APIC state for other OSs:
102 */
103 apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
104 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
105 apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
106 if (maxlvt >= 3)
107 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
108 if (maxlvt >= 4)
109 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
110 v = GET_APIC_VERSION(apic_read(APIC_LVR));
111 if (APIC_INTEGRATED(v)) { /* !82489DX */
112 if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
113 apic_write(APIC_ESR, 0);
114 apic_read(APIC_ESR);
115 }
116}
117
118void __init connect_bsp_APIC(void)
119{
120 if (pic_mode) {
121 /*
122 * Do not trust the local APIC being empty at bootup.
123 */
124 clear_local_APIC();
125 /*
126 * PIC mode, enable APIC mode in the IMCR, i.e.
127 * connect BSP's local APIC to INT and NMI lines.
128 */
129 apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
130 outb(0x70, 0x22);
131 outb(0x01, 0x23);
132 }
133}
134
135void disconnect_bsp_APIC(void)
136{
137 if (pic_mode) {
138 /*
139 * Put the board back into PIC mode (has an effect
140 * only on certain older boards). Note that APIC
141 * interrupts, including IPIs, won't work beyond
142 * this point! The only exception are INIT IPIs.
143 */
144 apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
145 outb(0x70, 0x22);
146 outb(0x00, 0x23);
147 }
148}
149
150void disable_local_APIC(void)
151{
152 unsigned int value;
153
154 clear_local_APIC();
155
156 /*
157 * Disable APIC (implies clearing of registers
158 * for 82489DX!).
159 */
160 value = apic_read(APIC_SPIV);
161 value &= ~APIC_SPIV_APIC_ENABLED;
162 apic_write_around(APIC_SPIV, value);
163}
164
165/*
166 * This is to verify that we're looking at a real local APIC.
167 * Check these against your board if the CPUs aren't getting
168 * started for no apparent reason.
169 */
170int __init verify_local_APIC(void)
171{
172 unsigned int reg0, reg1;
173
174 /*
175 * The version register is read-only in a real APIC.
176 */
177 reg0 = apic_read(APIC_LVR);
178 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
179 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
180 reg1 = apic_read(APIC_LVR);
181 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
182
183 /*
184 * The two version reads above should print the same
185 * numbers. If the second one is different, then we
186 * poke at a non-APIC.
187 */
188 if (reg1 != reg0)
189 return 0;
190
191 /*
192 * Check if the version looks reasonable.
193 */
194 reg1 = GET_APIC_VERSION(reg0);
195 if (reg1 == 0x00 || reg1 == 0xff)
196 return 0;
197 reg1 = get_maxlvt();
198 if (reg1 < 0x02 || reg1 == 0xff)
199 return 0;
200
201 /*
202 * The ID register is read/write in a real APIC.
203 */
204 reg0 = apic_read(APIC_ID);
205 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
206 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
207 reg1 = apic_read(APIC_ID);
208 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
209 apic_write(APIC_ID, reg0);
210 if (reg1 != (reg0 ^ APIC_ID_MASK))
211 return 0;
212
213 /*
214 * The next two are just to see if we have sane values.
215 * They're only really relevant if we're in Virtual Wire
216 * compatibility mode, but most boxes aren't anymore.
217 */
218 reg0 = apic_read(APIC_LVT0);
219 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
220 reg1 = apic_read(APIC_LVT1);
221 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
222
223 return 1;
224}
225
226void __init sync_Arb_IDs(void)
227{
228 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
229 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
230 if (ver >= 0x14) /* P4 or higher */
231 return;
232
233 /*
234 * Wait for idle.
235 */
236 apic_wait_icr_idle();
237
238 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
239 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
240 | APIC_DM_INIT);
241}
242
243extern void __error_in_apic_c (void);
244
245/*
246 * An initial setup of the virtual wire mode.
247 */
248void __init init_bsp_APIC(void)
249{
250 unsigned int value, ver;
251
252 /*
253 * Don't do the setup now if we have a SMP BIOS as the
254 * through-I/O-APIC virtual wire mode might be active.
255 */
256 if (smp_found_config || !cpu_has_apic)
257 return;
258
259 value = apic_read(APIC_LVR);
260 ver = GET_APIC_VERSION(value);
261
262 /*
263 * Do not trust the local APIC being empty at bootup.
264 */
265 clear_local_APIC();
266
267 /*
268 * Enable APIC.
269 */
270 value = apic_read(APIC_SPIV);
271 value &= ~APIC_VECTOR_MASK;
272 value |= APIC_SPIV_APIC_ENABLED;
273 value |= APIC_SPIV_FOCUS_DISABLED;
274 value |= SPURIOUS_APIC_VECTOR;
275 apic_write_around(APIC_SPIV, value);
276
277 /*
278 * Set up the virtual wire mode.
279 */
280 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
281 value = APIC_DM_NMI;
282 if (!APIC_INTEGRATED(ver)) /* 82489DX */
283 value |= APIC_LVT_LEVEL_TRIGGER;
284 apic_write_around(APIC_LVT1, value);
285}
286
287void __init setup_local_APIC (void)
288{
289 unsigned int value, ver, maxlvt;
290
291 /* Pound the ESR really hard over the head with a big hammer - mbligh */
292 if (esr_disable) {
293 apic_write(APIC_ESR, 0);
294 apic_write(APIC_ESR, 0);
295 apic_write(APIC_ESR, 0);
296 apic_write(APIC_ESR, 0);
297 }
298
299 value = apic_read(APIC_LVR);
300 ver = GET_APIC_VERSION(value);
301
302 if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
303 __error_in_apic_c();
304
305 /*
306 * Double-check whether this APIC is really registered.
307 * This is meaningless in clustered apic mode, so we skip it.
308 */
309 if (!apic_id_registered())
310 BUG();
311
312 /*
313 * Intel recommends to set DFR, LDR and TPR before enabling
314 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
315 * document number 292116). So here it goes...
316 */
317 init_apic_ldr();
318
319 /*
320 * Set Task Priority to 'accept all'. We never change this
321 * later on.
322 */
323 value = apic_read(APIC_TASKPRI);
324 value &= ~APIC_TPRI_MASK;
325 apic_write_around(APIC_TASKPRI, value);
326
327 /*
328 * Now that we are all set up, enable the APIC
329 */
330 value = apic_read(APIC_SPIV);
331 value &= ~APIC_VECTOR_MASK;
332 /*
333 * Enable APIC
334 */
335 value |= APIC_SPIV_APIC_ENABLED;
336
337 /*
338 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
339 * certain networking cards. If high frequency interrupts are
340 * happening on a particular IOAPIC pin, plus the IOAPIC routing
341 * entry is masked/unmasked at a high rate as well then sooner or
342 * later IOAPIC line gets 'stuck', no more interrupts are received
343 * from the device. If focus CPU is disabled then the hang goes
344 * away, oh well :-(
345 *
346 * [ This bug can be reproduced easily with a level-triggered
347 * PCI Ne2000 networking cards and PII/PIII processors, dual
348 * BX chipset. ]
349 */
350 /*
351 * Actually disabling the focus CPU check just makes the hang less
352 * frequent as it makes the interrupt distribution model be more
353 * like LRU than MRU (the short-term load is more even across CPUs).
354 * See also the comment in end_level_ioapic_irq(). --macro
355 */
356#if 1
357 /* Enable focus processor (bit==0) */
358 value &= ~APIC_SPIV_FOCUS_DISABLED;
359#else
360 /* Disable focus processor (bit==1) */
361 value |= APIC_SPIV_FOCUS_DISABLED;
362#endif
363 /*
364 * Set spurious IRQ vector
365 */
366 value |= SPURIOUS_APIC_VECTOR;
367 apic_write_around(APIC_SPIV, value);
368
369 /*
370 * Set up LVT0, LVT1:
371 *
372 * set up through-local-APIC on the BP's LINT0. This is not
373 * strictly necessary in pure symmetric-IO mode, but sometimes
374 * we delegate interrupts to the 8259A.
375 */
376 /*
377 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
378 */
379 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
380 if (!smp_processor_id() && (pic_mode || !value)) {
381 value = APIC_DM_EXTINT;
382 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
383 } else {
384 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
385 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
386 }
387 apic_write_around(APIC_LVT0, value);
388
389 /*
390 * only the BP should see the LINT1 NMI signal, obviously.
391 */
392 if (!smp_processor_id())
393 value = APIC_DM_NMI;
394 else
395 value = APIC_DM_NMI | APIC_LVT_MASKED;
396 if (!APIC_INTEGRATED(ver)) /* 82489DX */
397 value |= APIC_LVT_LEVEL_TRIGGER;
398 apic_write_around(APIC_LVT1, value);
399
400 if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
401 unsigned oldvalue;
402 maxlvt = get_maxlvt();
403 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
404 apic_write(APIC_ESR, 0);
405 oldvalue = apic_read(APIC_ESR);
406 value = ERROR_APIC_VECTOR; // enables sending errors
407 apic_write_around(APIC_LVTERR, value);
408 /*
409 * spec says clear errors after enabling vector.
410 */
411 if (maxlvt > 3)
412 apic_write(APIC_ESR, 0);
413 value = apic_read(APIC_ESR);
414 if (value != oldvalue)
415 apic_printk(APIC_VERBOSE,
416 "ESR value after enabling vector: %08x, after %08x\n",
417 oldvalue, value);
418 } else {
419 if (esr_disable)
420 /*
421 * Something untraceable is creating bad interrupts on
422 * secondary quads ... for the moment, just leave the
423 * ESR disabled - we can't do anything useful with the
424 * errors anyway - mbligh
425 */
426 apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
427 else
428 apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
429 }
430
431 nmi_watchdog_default();
432 if (nmi_watchdog == NMI_LOCAL_APIC)
433 setup_apic_nmi_watchdog();
434 apic_pm_activate();
435}
436
437#ifdef CONFIG_PM
438
439static struct {
440 /* 'active' is true if the local APIC was enabled by us and
441 not the BIOS; this signifies that we are also responsible
442 for disabling it before entering apm/acpi suspend */
443 int active;
444 /* r/w apic fields */
445 unsigned int apic_id;
446 unsigned int apic_taskpri;
447 unsigned int apic_ldr;
448 unsigned int apic_dfr;
449 unsigned int apic_spiv;
450 unsigned int apic_lvtt;
451 unsigned int apic_lvtpc;
452 unsigned int apic_lvt0;
453 unsigned int apic_lvt1;
454 unsigned int apic_lvterr;
455 unsigned int apic_tmict;
456 unsigned int apic_tdcr;
457 unsigned int apic_thmr;
458} apic_pm_state;
459
460static int lapic_suspend(struct sys_device *dev, u32 state)
461{
462 unsigned long flags;
463
464 if (!apic_pm_state.active)
465 return 0;
466
467 apic_pm_state.apic_id = apic_read(APIC_ID);
468 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
469 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
470 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
471 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
472 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
473 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
474 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
475 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
476 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
477 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
478 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
479 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
480 local_save_flags(flags);
481 local_irq_disable();
482 disable_local_APIC();
483 local_irq_restore(flags);
484 return 0;
485}
486
487static int lapic_resume(struct sys_device *dev)
488{
489 unsigned int l, h;
490 unsigned long flags;
491
492 if (!apic_pm_state.active)
493 return 0;
494
495 /* XXX: Pavel needs this for S3 resume, but can't explain why */
496 set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
497
498 local_irq_save(flags);
499 rdmsr(MSR_IA32_APICBASE, l, h);
500 l &= ~MSR_IA32_APICBASE_BASE;
501 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
502 wrmsr(MSR_IA32_APICBASE, l, h);
503 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
504 apic_write(APIC_ID, apic_pm_state.apic_id);
505 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
506 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
507 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
508 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
509 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
510 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
511 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
512 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
513 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
514 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
515 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
516 apic_write(APIC_ESR, 0);
517 apic_read(APIC_ESR);
518 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
519 apic_write(APIC_ESR, 0);
520 apic_read(APIC_ESR);
521 local_irq_restore(flags);
522 return 0;
523}
524
525static struct sysdev_class lapic_sysclass = {
526 set_kset_name("lapic"),
527 .resume = lapic_resume,
528 .suspend = lapic_suspend,
529};
530
531static struct sys_device device_lapic = {
532 .id = 0,
533 .cls = &lapic_sysclass,
534};
535
536static void __init apic_pm_activate(void)
537{
538 apic_pm_state.active = 1;
539}
540
541static int __init init_lapic_sysfs(void)
542{
543 int error;
544 if (!cpu_has_apic)
545 return 0;
546 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
547 error = sysdev_class_register(&lapic_sysclass);
548 if (!error)
549 error = sysdev_register(&device_lapic);
550 return error;
551}
552device_initcall(init_lapic_sysfs);
553
554#else /* CONFIG_PM */
555
556static void apic_pm_activate(void) { }
557
558#endif /* CONFIG_PM */
559
560static int __init apic_set_verbosity(char *str)
561{
562 if (strcmp("debug", str) == 0)
563 apic_verbosity = APIC_DEBUG;
564 else if (strcmp("verbose", str) == 0)
565 apic_verbosity = APIC_VERBOSE;
566 else
567 printk(KERN_WARNING "APIC Verbosity level %s not recognised,"
568 " use apic=verbose or apic=debug\n", str);
569
570 return 0;
571}
572
573__setup("apic=", apic_set_verbosity);
574
575/*
576 * Detect and enable local APICs on non-SMP boards.
577 * Original code written by Keir Fraser.
578 * On AMD64 we trust the BIOS - if it says no APIC it is likely
579 * not correctly set up (usually the APIC timer won't work etc.)
580 */
581
582static int __init detect_init_APIC (void)
583{
584 if (!cpu_has_apic) {
585 printk(KERN_INFO "No local APIC present\n");
586 return -1;
587 }
588
589 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
590 boot_cpu_id = 0;
591 return 0;
592}
593
594void __init init_apic_mappings(void)
595{
596 unsigned long apic_phys;
597
598 /*
599 * If no local APIC can be found then set up a fake all
600 * zeroes page to simulate the local APIC and another
601 * one for the IO-APIC.
602 */
603 if (!smp_found_config && detect_init_APIC()) {
604 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
605 apic_phys = __pa(apic_phys);
606 } else
607 apic_phys = mp_lapic_addr;
608
609 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
610 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
611
612 /*
613 * Fetch the APIC ID of the BSP in case we have a
614 * default configuration (or the MP table is broken).
615 */
616 if (boot_cpu_id == -1U)
617 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
618
619#ifdef CONFIG_X86_IO_APIC
620 {
621 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
622 int i;
623
624 for (i = 0; i < nr_ioapics; i++) {
625 if (smp_found_config) {
626 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
627 } else {
628 ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
629 ioapic_phys = __pa(ioapic_phys);
630 }
631 set_fixmap_nocache(idx, ioapic_phys);
632 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
633 __fix_to_virt(idx), ioapic_phys);
634 idx++;
635 }
636 }
637#endif
638}
639
640/*
641 * This function sets up the local APIC timer, with a timeout of
642 * 'clocks' APIC bus clock. During calibration we actually call
643 * this function twice on the boot CPU, once with a bogus timeout
644 * value, second time for real. The other (noncalibrating) CPUs
645 * call this function only once, with the real, calibrated value.
646 *
647 * We do reads before writes even if unnecessary, to get around the
648 * P5 APIC double write bug.
649 */
650
651#define APIC_DIVISOR 16
652
653static void __setup_APIC_LVTT(unsigned int clocks)
654{
655 unsigned int lvtt_value, tmp_value, ver;
656
657 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
658 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
659 if (!APIC_INTEGRATED(ver))
660 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
661 apic_write_around(APIC_LVTT, lvtt_value);
662
663 /*
664 * Divide PICLK by 16
665 */
666 tmp_value = apic_read(APIC_TDCR);
667 apic_write_around(APIC_TDCR, (tmp_value
668 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
669 | APIC_TDR_DIV_16);
670
671 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
672}
673
674static void setup_APIC_timer(unsigned int clocks)
675{
676 unsigned long flags;
677
678 local_irq_save(flags);
679
680 /* For some reasons this doesn't work on Simics, so fake it for now */
681 if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) {
682 __setup_APIC_LVTT(clocks);
683 return;
684 }
685
686 /* wait for irq slice */
687 if (vxtime.hpet_address) {
688 int trigger = hpet_readl(HPET_T0_CMP);
689 while (hpet_readl(HPET_COUNTER) >= trigger)
690 /* do nothing */ ;
691 while (hpet_readl(HPET_COUNTER) < trigger)
692 /* do nothing */ ;
693 } else {
694 int c1, c2;
695 outb_p(0x00, 0x43);
696 c2 = inb_p(0x40);
697 c2 |= inb_p(0x40) << 8;
698 do {
699 c1 = c2;
700 outb_p(0x00, 0x43);
701 c2 = inb_p(0x40);
702 c2 |= inb_p(0x40) << 8;
703 } while (c2 - c1 < 300);
704 }
705
706 __setup_APIC_LVTT(clocks);
707
708 local_irq_restore(flags);
709}
710
711/*
712 * In this function we calibrate APIC bus clocks to the external
713 * timer. Unfortunately we cannot use jiffies and the timer irq
714 * to calibrate, since some later bootup code depends on getting
715 * the first irq? Ugh.
716 *
717 * We want to do the calibration only once since we
718 * want to have local timer irqs synchronized. CPUs connected
719 * by the same APIC bus have the very same bus frequency.
720 * And we want to have irqs off anyways, no accidental
721 * APIC irq that way.
722 */
723
724#define TICK_COUNT 100000000
725
726static int __init calibrate_APIC_clock(void)
727{
728 int apic, apic_start, tsc, tsc_start;
729 int result;
730 /*
731 * Put whatever arbitrary (but long enough) timeout
732 * value into the APIC clock, we just want to get the
733 * counter running for calibration.
734 */
735 __setup_APIC_LVTT(1000000000);
736
737 apic_start = apic_read(APIC_TMCCT);
738 rdtscl(tsc_start);
739
740 do {
741 apic = apic_read(APIC_TMCCT);
742 rdtscl(tsc);
743 } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
744
745 result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
746
747 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
748 result / 1000 / 1000, result / 1000 % 1000);
749
750 return result * APIC_DIVISOR / HZ;
751}
752
753static unsigned int calibration_result;
754
755void __init setup_boot_APIC_clock (void)
756{
757 if (disable_apic_timer) {
758 printk(KERN_INFO "Disabling APIC timer\n");
759 return;
760 }
761
762 printk(KERN_INFO "Using local APIC timer interrupts.\n");
763 using_apic_timer = 1;
764
765 local_irq_disable();
766
767 calibration_result = calibrate_APIC_clock();
768 /*
769 * Now set up the timer for real.
770 */
771 setup_APIC_timer(calibration_result);
772
773 local_irq_enable();
774}
775
776void __init setup_secondary_APIC_clock(void)
777{
778 local_irq_disable(); /* FIXME: Do we need this? --RR */
779 setup_APIC_timer(calibration_result);
780 local_irq_enable();
781}
782
783void __init disable_APIC_timer(void)
784{
785 if (using_apic_timer) {
786 unsigned long v;
787
788 v = apic_read(APIC_LVTT);
789 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
790 }
791}
792
793void enable_APIC_timer(void)
794{
795 if (using_apic_timer) {
796 unsigned long v;
797
798 v = apic_read(APIC_LVTT);
799 apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
800 }
801}
802
803/*
804 * the frequency of the profiling timer can be changed
805 * by writing a multiplier value into /proc/profile.
806 */
807int setup_profiling_timer(unsigned int multiplier)
808{
809 int i;
810
811 /*
812 * Sanity check. [at least 500 APIC cycles should be
813 * between APIC interrupts as a rule of thumb, to avoid
814 * irqs flooding us]
815 */
816 if ( (!multiplier) || (calibration_result/multiplier < 500))
817 return -EINVAL;
818
819 /*
820 * Set the new multiplier for each CPU. CPUs don't start using the
821 * new values until the next timer interrupt in which they do process
822 * accounting. At that time they also adjust their APIC timers
823 * accordingly.
824 */
825 for (i = 0; i < NR_CPUS; ++i)
826 per_cpu(prof_multiplier, i) = multiplier;
827
828 return 0;
829}
830
831#undef APIC_DIVISOR
832
833/*
834 * Local timer interrupt handler. It does both profiling and
835 * process statistics/rescheduling.
836 *
837 * We do profiling in every local tick, statistics/rescheduling
838 * happen only every 'profiling multiplier' ticks. The default
839 * multiplier is 1 and it can be changed by writing the new multiplier
840 * value into /proc/profile.
841 */
842
843void smp_local_timer_interrupt(struct pt_regs *regs)
844{
845 int cpu = smp_processor_id();
846
847 profile_tick(CPU_PROFILING, regs);
848 if (--per_cpu(prof_counter, cpu) <= 0) {
849 /*
850 * The multiplier may have changed since the last time we got
851 * to this point as a result of the user writing to
852 * /proc/profile. In this case we need to adjust the APIC
853 * timer accordingly.
854 *
855 * Interrupts are already masked off at this point.
856 */
857 per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
858 if (per_cpu(prof_counter, cpu) !=
859 per_cpu(prof_old_multiplier, cpu)) {
860 __setup_APIC_LVTT(calibration_result/
861 per_cpu(prof_counter, cpu));
862 per_cpu(prof_old_multiplier, cpu) =
863 per_cpu(prof_counter, cpu);
864 }
865
866#ifdef CONFIG_SMP
867 update_process_times(user_mode(regs));
868#endif
869 }
870
871 /*
872 * We take the 'long' return path, and there every subsystem
873 * grabs the appropriate locks (kernel lock/ irq lock).
874 *
875 * we might want to decouple profiling from the 'long path',
876 * and do the profiling totally in assembly.
877 *
878 * Currently this isn't too much of an issue (performance wise),
879 * we can take more than 100K local irqs per second on a 100 MHz P5.
880 */
881}
882
883/*
884 * Local APIC timer interrupt. This is the most natural way for doing
885 * local interrupts, but local timer interrupts can be emulated by
886 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
887 *
888 * [ if a single-CPU system runs an SMP kernel then we call the local
889 * interrupt as well. Thus we cannot inline the local irq ... ]
890 */
891void smp_apic_timer_interrupt(struct pt_regs *regs)
892{
893 /*
894 * the NMI deadlock-detector uses this.
895 */
896 add_pda(apic_timer_irqs, 1);
897
898 /*
899 * NOTE! We'd better ACK the irq immediately,
900 * because timer handling can be slow.
901 */
902 ack_APIC_irq();
903 /*
904 * update_process_times() expects us to have done irq_enter().
905 * Besides, if we don't, timer interrupts ignore the global
906 * interrupt lock, which is the WrongThing (tm) to do.
907 */
908 irq_enter();
909 smp_local_timer_interrupt(regs);
910 irq_exit();
911}
912
913/*
914 * oem_force_hpet_timer -- force HPET mode for some boxes.
915 *
916 * Thus far, the major user of this is IBM's Summit2 series:
917 *
918 * Clustered boxes may have unsynced TSC problems if they are
919 * multi-chassis. Use available data to take a good guess.
920 * If in doubt, go HPET.
921 */
922__init int oem_force_hpet_timer(void)
923{
924 int i, clusters, zeros;
925 unsigned id;
926 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
927
928 bitmap_empty(clustermap, NUM_APIC_CLUSTERS);
929
930 for (i = 0; i < NR_CPUS; i++) {
931 id = bios_cpu_apicid[i];
932 if (id != BAD_APICID)
933 __set_bit(APIC_CLUSTERID(id), clustermap);
934 }
935
936 /* Problem: Partially populated chassis may not have CPUs in some of
937 * the APIC clusters they have been allocated. Only present CPUs have
938 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
939 * clusters are allocated sequentially, count zeros only if they are
940 * bounded by ones.
941 */
942 clusters = 0;
943 zeros = 0;
944 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
945 if (test_bit(i, clustermap)) {
946 clusters += 1 + zeros;
947 zeros = 0;
948 } else
949 ++zeros;
950 }
951
952 /*
953 * If clusters > 2, then this should be multi-chassis. Return 1 for HPET.
954 * Else return 0 to use TSC.
955 * May have to revisit this when multi-core + hyperthreaded CPUs come
956 * out, but AFAIK this will work even for them.
957 */
958 return (clusters > 2);
959}
960
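The counting rule above only adds runs of zero bits when they are bounded by set bits, so gaps left by partially populated chassis still count as clusters while trailing empty clusters do not. A small stand-alone sketch of the same loop over a hypothetical occupancy map (clusters 0, 2 and 3 populated) shows the effect:

#include <stdio.h>

int main(void)
{
	/* hypothetical cluster occupancy: entry i set means cluster i has a CPU */
	int clustermap[8] = { 1, 0, 1, 1, 0, 0, 0, 0 };
	int i, clusters = 0, zeros = 0;

	/* same rule as oem_force_hpet_timer(): count zeros only when bounded by ones */
	for (i = 0; i < 8; i++) {
		if (clustermap[i]) {
			clusters += 1 + zeros;
			zeros = 0;
		} else
			++zeros;
	}
	/* clusters == 4 here (the hole at cluster 1 counts, trailing zeros do not) */
	printf("clusters = %d -> %s\n", clusters, clusters > 2 ? "HPET" : "TSC");
	return 0;
}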
961/*
962 * This interrupt should _never_ happen with our APIC/SMP architecture
963 */
964asmlinkage void smp_spurious_interrupt(void)
965{
966 unsigned int v;
967 irq_enter();
968 /*
969 * Check if this really is a spurious interrupt and ACK it
970 * if it is a vectored one. Just in case...
971 * Spurious interrupts should not be ACKed.
972 */
973 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
974 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
975 ack_APIC_irq();
976
977#if 0
978 static unsigned long last_warning;
979 static unsigned long skipped;
980
981 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
982 if (time_before(last_warning+30*HZ,jiffies)) {
983 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
984 smp_processor_id(), skipped);
985 last_warning = jiffies;
986 skipped = 0;
987 } else {
988 skipped++;
989 }
990#endif
991 irq_exit();
992}
993
994/*
995 * This interrupt should never happen with our APIC/SMP architecture
996 */
997
998asmlinkage void smp_error_interrupt(void)
999{
1000 unsigned int v, v1;
1001
1002 irq_enter();
1003 /* First tickle the hardware, only then report what went on. -- REW */
1004 v = apic_read(APIC_ESR);
1005 apic_write(APIC_ESR, 0);
1006 v1 = apic_read(APIC_ESR);
1007 ack_APIC_irq();
1008 atomic_inc(&irq_err_count);
1009
1010 /* Here is what the APIC error bits mean:
1011 0: Send CS error
1012 1: Receive CS error
1013 2: Send accept error
1014 3: Receive accept error
1015 4: Reserved
1016 5: Send illegal vector
1017 6: Received illegal vector
1018 7: Illegal register address
1019 */
1020 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1021 smp_processor_id(), v , v1);
1022 irq_exit();
1023}
1024
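The handler above only prints the raw ESR value before and after the write-to-clear; decoding is left to the reader using the bit list in the comment. A hypothetical stand-alone decoder for such a value could look like this (helper name and sample value are illustrative, not part of the kernel):

#include <stdio.h>

/* bit meanings as listed in the comment in smp_error_interrupt() */
static const char *apic_esr_bit[8] = {
	"Send CS error", "Receive CS error",
	"Send accept error", "Receive accept error",
	"Reserved", "Send illegal vector",
	"Received illegal vector", "Illegal register address",
};

static void decode_apic_esr(unsigned int v)
{
	int i;
	for (i = 0; i < 8; i++)
		if (v & (1u << i))
			printf("  bit %d: %s\n", i, apic_esr_bit[i]);
}

int main(void)
{
	decode_apic_esr(0x40);	/* sample value: received illegal vector */
	return 0;
}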
1025int disable_apic;
1026
1027/*
1028 * This initializes the IO-APIC and APIC hardware if this is
1029 * a UP kernel.
1030 */
1031int __init APIC_init_uniprocessor (void)
1032{
1033 if (disable_apic) {
1034 printk(KERN_INFO "Apic disabled\n");
1035 return -1;
1036 }
1037 if (!cpu_has_apic) {
1038 disable_apic = 1;
1039 printk(KERN_INFO "Apic disabled by BIOS\n");
1040 return -1;
1041 }
1042
1043 verify_local_APIC();
1044
1045 connect_bsp_APIC();
1046
1047 phys_cpu_present_map = physid_mask_of_physid(0);
1048 apic_write_around(APIC_ID, boot_cpu_id);
1049
1050 setup_local_APIC();
1051
1052#ifdef CONFIG_X86_IO_APIC
1053 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1054 setup_IO_APIC();
1055 else
1056 nr_ioapics = 0;
1057#endif
1058 setup_boot_APIC_clock();
1059
1060 return 0;
1061}
1062
1063static __init int setup_disableapic(char *str)
1064{
1065 disable_apic = 1;
1066 return 0;
1067}
1068
1069static __init int setup_nolapic(char *str)
1070{
1071 disable_apic = 1;
1072 return 0;
1073}
1074
1075static __init int setup_noapictimer(char *str)
1076{
1077 disable_apic_timer = 1;
1078 return 0;
1079}
1080
1081/* dummy parsing: see setup.c */
1082
1083__setup("disableapic", setup_disableapic);
1084__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
1085
1086__setup("noapictimer", setup_noapictimer);
1087
1088/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..35b4c3fcbb37
--- /dev/null
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -0,0 +1,69 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6
7#include <linux/sched.h>
8#include <linux/stddef.h>
9#include <linux/errno.h>
10#include <linux/hardirq.h>
11#include <linux/suspend.h>
12#include <asm/pda.h>
13#include <asm/processor.h>
14#include <asm/segment.h>
15#include <asm/thread_info.h>
16#include <asm/ia32.h>
17
18#define DEFINE(sym, val) \
19 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
20
21#define BLANK() asm volatile("\n->" : : )
22
23int main(void)
24{
25#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
26 ENTRY(state);
27 ENTRY(flags);
28 ENTRY(thread);
29 ENTRY(pid);
30 BLANK();
31#undef ENTRY
32#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
33 ENTRY(flags);
34 ENTRY(addr_limit);
35 ENTRY(preempt_count);
36 BLANK();
37#undef ENTRY
38#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
39 ENTRY(kernelstack);
40 ENTRY(oldrsp);
41 ENTRY(pcurrent);
42 ENTRY(irqrsp);
43 ENTRY(irqcount);
44 ENTRY(cpunumber);
45 ENTRY(irqstackptr);
46 BLANK();
47#undef ENTRY
48#ifdef CONFIG_IA32_EMULATION
49#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
50 ENTRY(eax);
51 ENTRY(ebx);
52 ENTRY(ecx);
53 ENTRY(edx);
54 ENTRY(esi);
55 ENTRY(edi);
56 ENTRY(ebp);
57 ENTRY(esp);
58 ENTRY(eip);
59 BLANK();
60#undef ENTRY
61 DEFINE(IA32_RT_SIGFRAME_sigcontext,
62 offsetof (struct rt_sigframe32, uc.uc_mcontext));
63 BLANK();
64#endif
65 DEFINE(pbe_address, offsetof(struct pbe, address));
66 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
67 DEFINE(pbe_next, offsetof(struct pbe, next));
68 return 0;
69}
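Note that main() here is never executed: the file is only compiled to assembly, and the DEFINE() macro's inline asm leaves "->sym value" marker lines in that output, which the build post-processes (keying on the "->" prefix) into the <asm/offset.h> constants that entry.S and the other assembly files include. A minimal stand-alone illustration of the trick, using a made-up struct and symbol name:

/* compile with: gcc -S offsets-demo.c -o offsets-demo.s
 * the .s file then contains a marker line such as
 *   ->demo_b $4 offsetof(struct demo, b)
 * (the exact immediate syntax depends on the target assembler)
 */
#include <stddef.h>

struct demo {
	int a;
	int b;
};

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

int main(void)
{
	DEFINE(demo_b, offsetof(struct demo, b));
	return 0;
}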
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..81f1562e5393
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -0,0 +1,96 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_POWERNOW_K8
14 tristate "AMD Opteron/Athlon64 PowerNow!"
15 select CPU_FREQ_TABLE
16 help
17 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
18
19 For details, take a look at <file:Documentation/cpu-freq/>.
20
21 If in doubt, say N.
22
23config X86_POWERNOW_K8_ACPI
24 bool
25 depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
26 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
27 default y
28
29config X86_SPEEDSTEP_CENTRINO
30 tristate "Intel Enhanced SpeedStep"
31 select CPU_FREQ_TABLE
32 depends on ACPI_PROCESSOR
33 help
34 This adds the CPUFreq driver for Enhanced SpeedStep enabled
35 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
36 or 64bit enabled Intel Xeons.
37
38 For details, take a look at <file:Documentation/cpu-freq/>.
39
40 If in doubt, say N.
41
42config X86_SPEEDSTEP_CENTRINO_ACPI
43 bool
44 depends on X86_SPEEDSTEP_CENTRINO
45 default y
46
47config X86_ACPI_CPUFREQ
48 tristate "ACPI Processor P-States driver"
49 depends on ACPI_PROCESSOR
50 help
51 This driver adds a CPUFreq driver which utilizes the ACPI
52 Processor Performance States.
53
54 For details, take a look at <file:Documentation/cpu-freq/>.
55
56 If in doubt, say N.
57
58comment "shared options"
59
60config X86_ACPI_CPUFREQ_PROC_INTF
61 bool "/proc/acpi/processor/../performance interface (deprecated)"
62 depends on PROC_FS
63 depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
64 help
65 This enables the deprecated /proc/acpi/processor/../performance
66 interface. While it is helpful for debugging, the generic,
67 cross-architecture cpufreq interfaces should be used.
68
69 If in doubt, say N.
70
71config X86_P4_CLOCKMOD
72 tristate "Intel Pentium 4 clock modulation"
73 depends on EMBEDDED
74 help
75 This adds the clock modulation driver for Intel Pentium 4 / XEON
76 processors. When enabled it will lower CPU temperature by skipping
77 clocks.
78
79 This driver should only be used in exceptional
80 circumstances when very low power is needed because it causes severe
81 slowdowns and noticeable latencies. Normally Speedstep should be used
82 instead.
83
84 For details, take a look at <file:Documentation/cpu-freq/>.
85
86 Unless you are absolutely sure say N.
87
88
89config X86_SPEEDSTEP_LIB
90 tristate
91 default X86_P4_CLOCKMOD
92
93endif
94
95endmenu
96
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..d8b593879224
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Makefile
@@ -0,0 +1,17 @@
1#
2# Reuse the i386 cpufreq drivers
3#
4
5SRCDIR := ../../../i386/kernel/cpu/cpufreq
6
7obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
8obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
9obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
10obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12
13powernow-k8-objs := ${SRCDIR}/powernow-k8.o
14speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o
15acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o
16p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o
17speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
new file mode 100644
index 000000000000..657003e461e6
--- /dev/null
+++ b/arch/x86_64/kernel/e820.c
@@ -0,0 +1,513 @@
1/*
2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over.
4 * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
5 */
6#include <linux/config.h>
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/init.h>
10#include <linux/bootmem.h>
11#include <linux/ioport.h>
12#include <linux/string.h>
13#include <asm/page.h>
14#include <asm/e820.h>
15#include <asm/proto.h>
16#include <asm/bootsetup.h>
17
18extern char _end[];
19
20/*
21 * PFN of last memory page.
22 */
23unsigned long end_pfn;
24
25/*
26 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
27 * The direct mapping extends to end_pfn_map, so that we can directly access
28 * apertures, ACPI and other tables without having to play with fixmaps.
29 */
30unsigned long end_pfn_map;
31
32/*
33 * Last pfn which the user wants to use.
34 */
35unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
36
37extern struct resource code_resource, data_resource;
38
39/* Check for some hardcoded bad areas that early boot is not allowed to touch */
40static inline int bad_addr(unsigned long *addrp, unsigned long size)
41{
42 unsigned long addr = *addrp, last = addr + size;
43
44 /* various gunk below that is needed for SMP startup */
45 if (addr < 0x8000) {
46 *addrp = 0x8000;
47 return 1;
48 }
49
50 /* direct mapping tables of the kernel */
51 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
52 *addrp = table_end << PAGE_SHIFT;
53 return 1;
54 }
55
56 /* initrd */
57#ifdef CONFIG_BLK_DEV_INITRD
58 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
59 addr < INITRD_START+INITRD_SIZE) {
60 *addrp = INITRD_START + INITRD_SIZE;
61 return 1;
62 }
63#endif
64 /* kernel code + 640k memory hole (later should not be needed, but
65 be paranoid for now) */
66 if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
67 *addrp = __pa_symbol(&_end);
68 return 1;
69 }
70 /* XXX ramdisk image here? */
71 return 0;
72}
73
74int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
75{
76 int i;
77 for (i = 0; i < e820.nr_map; i++) {
78 struct e820entry *ei = &e820.map[i];
79 if (type && ei->type != type)
80 continue;
81 if (ei->addr >= end || ei->addr + ei->size < start)
82 continue;
83 return 1;
84 }
85 return 0;
86}
87
88/*
89 * Find a free area in a specific range.
90 */
91unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
92{
93 int i;
94 for (i = 0; i < e820.nr_map; i++) {
95 struct e820entry *ei = &e820.map[i];
96 unsigned long addr = ei->addr, last;
97 if (ei->type != E820_RAM)
98 continue;
99 if (addr < start)
100 addr = start;
101 if (addr > ei->addr + ei->size)
102 continue;
103 while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
104 ;
105 last = addr + size;
106 if (last > ei->addr + ei->size)
107 continue;
108 if (last > end)
109 continue;
110 return addr;
111 }
112 return -1UL;
113}
114
115/*
116 * Free bootmem based on the e820 table for a node.
117 */
118void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
119{
120 int i;
121 for (i = 0; i < e820.nr_map; i++) {
122 struct e820entry *ei = &e820.map[i];
123 unsigned long last, addr;
124
125 if (ei->type != E820_RAM ||
126 ei->addr+ei->size <= start ||
127 ei->addr > end)
128 continue;
129
130 addr = round_up(ei->addr, PAGE_SIZE);
131 if (addr < start)
132 addr = start;
133
134 last = round_down(ei->addr + ei->size, PAGE_SIZE);
135 if (last >= end)
136 last = end;
137
138 if (last > addr && last-addr >= PAGE_SIZE)
139 free_bootmem_node(pgdat, addr, last-addr);
140 }
141}
142
143/*
144 * Find the highest page frame number we have available
145 */
146unsigned long __init e820_end_of_ram(void)
147{
148 int i;
149 unsigned long end_pfn = 0;
150
151 for (i = 0; i < e820.nr_map; i++) {
152 struct e820entry *ei = &e820.map[i];
153 unsigned long start, end;
154
155 start = round_up(ei->addr, PAGE_SIZE);
156 end = round_down(ei->addr + ei->size, PAGE_SIZE);
157 if (start >= end)
158 continue;
159 if (ei->type == E820_RAM) {
160 if (end > end_pfn<<PAGE_SHIFT)
161 end_pfn = end>>PAGE_SHIFT;
162 } else {
163 if (end > end_pfn_map<<PAGE_SHIFT)
164 end_pfn_map = end>>PAGE_SHIFT;
165 }
166 }
167
168 if (end_pfn > end_pfn_map)
169 end_pfn_map = end_pfn;
170 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
171 end_pfn_map = MAXMEM>>PAGE_SHIFT;
172 if (end_pfn > end_user_pfn)
173 end_pfn = end_user_pfn;
174 if (end_pfn > end_pfn_map)
175 end_pfn = end_pfn_map;
176
177 return end_pfn;
178}
179
180/*
181 * Mark e820 reserved areas as busy for the resource manager.
182 */
183void __init e820_reserve_resources(void)
184{
185 int i;
186 for (i = 0; i < e820.nr_map; i++) {
187 struct resource *res;
188 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
189 continue;
190 res = alloc_bootmem_low(sizeof(struct resource));
191 switch (e820.map[i].type) {
192 case E820_RAM: res->name = "System RAM"; break;
193 case E820_ACPI: res->name = "ACPI Tables"; break;
194 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
195 default: res->name = "reserved";
196 }
197 res->start = e820.map[i].addr;
198 res->end = res->start + e820.map[i].size - 1;
199 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
200 request_resource(&iomem_resource, res);
201 if (e820.map[i].type == E820_RAM) {
202 /*
203 * We don't know which RAM region contains kernel data,
204 * so we try it repeatedly and let the resource manager
205 * test it.
206 */
207 request_resource(res, &code_resource);
208 request_resource(res, &data_resource);
209 }
210 }
211}
212
213/*
214 * Add a memory region to the kernel e820 map.
215 */
216void __init add_memory_region(unsigned long start, unsigned long size, int type)
217{
218 int x = e820.nr_map;
219
220 if (x == E820MAX) {
221 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
222 return;
223 }
224
225 e820.map[x].addr = start;
226 e820.map[x].size = size;
227 e820.map[x].type = type;
228 e820.nr_map++;
229}
230
231void __init e820_print_map(char *who)
232{
233 int i;
234
235 for (i = 0; i < e820.nr_map; i++) {
236 printk(" %s: %016Lx - %016Lx ", who,
237 (unsigned long long) e820.map[i].addr,
238 (unsigned long long) (e820.map[i].addr + e820.map[i].size));
239 switch (e820.map[i].type) {
240 case E820_RAM: printk("(usable)\n");
241 break;
242 case E820_RESERVED:
243 printk("(reserved)\n");
244 break;
245 case E820_ACPI:
246 printk("(ACPI data)\n");
247 break;
248 case E820_NVS:
249 printk("(ACPI NVS)\n");
250 break;
251 default: printk("type %u\n", e820.map[i].type);
252 break;
253 }
254 }
255}
256
257/*
258 * Sanitize the BIOS e820 map.
259 *
260 * Some e820 responses include overlapping entries. The following
261 * replaces the original e820 map with a new one, removing overlaps.
262 *
263 */
264static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
265{
266 struct change_member {
267 struct e820entry *pbios; /* pointer to original bios entry */
268 unsigned long long addr; /* address for this change point */
269 };
270 static struct change_member change_point_list[2*E820MAX] __initdata;
271 static struct change_member *change_point[2*E820MAX] __initdata;
272 static struct e820entry *overlap_list[E820MAX] __initdata;
273 static struct e820entry new_bios[E820MAX] __initdata;
274 struct change_member *change_tmp;
275 unsigned long current_type, last_type;
276 unsigned long long last_addr;
277 int chgidx, still_changing;
278 int overlap_entries;
279 int new_bios_entry;
280 int old_nr, new_nr;
281 int i;
282
283 /*
284 Visually we're performing the following (1,2,3,4 = memory types)...
285
286 Sample memory map (w/overlaps):
287 ____22__________________
288 ______________________4_
289 ____1111________________
290 _44_____________________
291 11111111________________
292 ____________________33__
293 ___________44___________
294 __________33333_________
295 ______________22________
296 ___________________2222_
297 _________111111111______
298 _____________________11_
299 _________________4______
300
301 Sanitized equivalent (no overlap):
302 1_______________________
303 _44_____________________
304 ___1____________________
305 ____22__________________
306 ______11________________
307 _________1______________
308 __________3_____________
309 ___________44___________
310 _____________33_________
311 _______________2________
312 ________________1_______
313 _________________4______
314 ___________________2____
315 ____________________33__
316 ______________________4_
317 */
318
319 /* if there's only one memory region, don't bother */
320 if (*pnr_map < 2)
321 return -1;
322
323 old_nr = *pnr_map;
324
325 /* bail out if we find any unreasonable addresses in bios map */
326 for (i=0; i<old_nr; i++)
327 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
328 return -1;
329
330 /* create pointers for initial change-point information (for sorting) */
331 for (i=0; i < 2*old_nr; i++)
332 change_point[i] = &change_point_list[i];
333
334 /* record all known change-points (starting and ending addresses) */
335 chgidx = 0;
336 for (i=0; i < old_nr; i++) {
337 change_point[chgidx]->addr = biosmap[i].addr;
338 change_point[chgidx++]->pbios = &biosmap[i];
339 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
340 change_point[chgidx++]->pbios = &biosmap[i];
341 }
342
343 /* sort change-point list by memory addresses (low -> high) */
344 still_changing = 1;
345 while (still_changing) {
346 still_changing = 0;
347 for (i=1; i < 2*old_nr; i++) {
348 /* if <current_addr> > <last_addr>, swap */
349 /* or, if current=<start_addr> & last=<end_addr>, swap */
350 if ((change_point[i]->addr < change_point[i-1]->addr) ||
351 ((change_point[i]->addr == change_point[i-1]->addr) &&
352 (change_point[i]->addr == change_point[i]->pbios->addr) &&
353 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
354 )
355 {
356 change_tmp = change_point[i];
357 change_point[i] = change_point[i-1];
358 change_point[i-1] = change_tmp;
359 still_changing=1;
360 }
361 }
362 }
363
364 /* create a new bios memory map, removing overlaps */
365 overlap_entries=0; /* number of entries in the overlap table */
366 new_bios_entry=0; /* index for creating new bios map entries */
367 last_type = 0; /* start with undefined memory type */
368 last_addr = 0; /* start with 0 as last starting address */
369 /* loop through change-points, determining the effect on the new bios map */
370 for (chgidx=0; chgidx < 2*old_nr; chgidx++)
371 {
372 /* keep track of all overlapping bios entries */
373 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
374 {
375 /* add map entry to overlap list (> 1 entry implies an overlap) */
376 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
377 }
378 else
379 {
380 /* remove entry from list (order independent, so swap with last) */
381 for (i=0; i<overlap_entries; i++)
382 {
383 if (overlap_list[i] == change_point[chgidx]->pbios)
384 overlap_list[i] = overlap_list[overlap_entries-1];
385 }
386 overlap_entries--;
387 }
388 /* if there are overlapping entries, decide which "type" to use */
389 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
390 current_type = 0;
391 for (i=0; i<overlap_entries; i++)
392 if (overlap_list[i]->type > current_type)
393 current_type = overlap_list[i]->type;
394 /* continue building up new bios map based on this information */
395 if (current_type != last_type) {
396 if (last_type != 0) {
397 new_bios[new_bios_entry].size =
398 change_point[chgidx]->addr - last_addr;
399 /* move forward only if the new size was non-zero */
400 if (new_bios[new_bios_entry].size != 0)
401 if (++new_bios_entry >= E820MAX)
402 break; /* no more space left for new bios entries */
403 }
404 if (current_type != 0) {
405 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
406 new_bios[new_bios_entry].type = current_type;
407 last_addr=change_point[chgidx]->addr;
408 }
409 last_type = current_type;
410 }
411 }
412 new_nr = new_bios_entry; /* retain count for new bios entries */
413
414 /* copy new bios mapping into original location */
415 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
416 *pnr_map = new_nr;
417
418 return 0;
419}
420
421/*
422 * Copy the BIOS e820 map into a safe place.
423 *
424 * Sanity-check it while we're at it..
425 *
426 * If we're lucky and live on a modern system, the setup code
427 * will have given us a memory map that we can use to properly
428 * set up memory. If we aren't, we'll fake a memory map.
429 *
430 * We check to see that the memory map contains at least 2 elements
431 * before we'll use it, because the detection code in setup.S may
432 * not be perfect and most every PC known to man has two memory
433 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
434 * thinkpad 560x, for example, does not cooperate with the memory
435 * detection code.)
436 */
437static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
438{
439 /* Only one memory region (or negative)? Ignore it */
440 if (nr_map < 2)
441 return -1;
442
443 do {
444 unsigned long start = biosmap->addr;
445 unsigned long size = biosmap->size;
446 unsigned long end = start + size;
447 unsigned long type = biosmap->type;
448
449 /* Overflow in 64 bits? Ignore the memory map. */
450 if (start > end)
451 return -1;
452
453 /*
454 * Some BIOSes claim RAM in the 640k - 1M region.
455 * Not right. Fix it up.
456 *
457 * This should be removed on Hammer, which is not supposed to
458 * have non-e820-covered ISA mappings there, but I had some strange
459 * problems so it stays for now. -AK
460 */
461 if (type == E820_RAM) {
462 if (start < 0x100000ULL && end > 0xA0000ULL) {
463 if (start < 0xA0000ULL)
464 add_memory_region(start, 0xA0000ULL-start, type);
465 if (end <= 0x100000ULL)
466 continue;
467 start = 0x100000ULL;
468 size = end - start;
469 }
470 }
471
472 add_memory_region(start, size, type);
473 } while (biosmap++,--nr_map);
474 return 0;
475}
476
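The E820_RAM fixup above splits any BIOS RAM entry that straddles the legacy 640k-1M hole into a below-640k piece and an above-1M piece. For a hypothetical entry covering 0x80000-0x180000 that means two regions, 0x80000-0xA0000 and 0x100000-0x180000, with the hole dropped. A stand-alone sketch of just that splitting rule:

#include <stdio.h>

/* hypothetical stand-in for add_memory_region(): just print the range */
static void add_range(unsigned long start, unsigned long size)
{
	printf("  RAM %#lx - %#lx\n", start, start + size);
}

/* same splitting rule as the E820_RAM branch of copy_e820_map() */
static void add_ram(unsigned long start, unsigned long end)
{
	if (start < 0x100000UL && end > 0xA0000UL) {
		if (start < 0xA0000UL)
			add_range(start, 0xA0000UL - start);
		if (end <= 0x100000UL)
			return;
		start = 0x100000UL;
	}
	add_range(start, end - start);
}

int main(void)
{
	add_ram(0x80000UL, 0x180000UL);	/* straddles the 640k-1M hole */
	return 0;
}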
477void __init setup_memory_region(void)
478{
479 char *who = "BIOS-e820";
480
481 /*
482 * Try to copy the BIOS-supplied E820-map.
483 *
484 * Otherwise fake a memory map; one section from 0k->640k,
485 * the next section from 1mb->appropriate_mem_k
486 */
487 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
488 if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
489 unsigned long mem_size;
490
491 /* compare results from other methods and take the greater */
492 if (ALT_MEM_K < EXT_MEM_K) {
493 mem_size = EXT_MEM_K;
494 who = "BIOS-88";
495 } else {
496 mem_size = ALT_MEM_K;
497 who = "BIOS-e801";
498 }
499
500 e820.nr_map = 0;
501 add_memory_region(0, LOWMEMSIZE(), E820_RAM);
502 add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
503 }
504 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
505 e820_print_map(who);
506}
507
508void __init parse_memopt(char *p, char **from)
509{
510 end_user_pfn = memparse(p, from);
511 end_user_pfn >>= PAGE_SHIFT;
512}
513
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
new file mode 100644
index 000000000000..750bcd0655dc
--- /dev/null
+++ b/arch/x86_64/kernel/early_printk.c
@@ -0,0 +1,220 @@
1#include <linux/console.h>
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/string.h>
5#include <asm/io.h>
6#include <asm/processor.h>
7
8/* Simple VGA output */
9
10#ifdef __i386__
11#define VGABASE (__ISA_IO_base + 0xb8000)
12#else
13#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
14#endif
15
16#define MAX_YPOS 25
17#define MAX_XPOS 80
18
19static int current_ypos = 1, current_xpos = 0;
20
21static void early_vga_write(struct console *con, const char *str, unsigned n)
22{
23 char c;
24 int i, k, j;
25
26 while ((c = *str++) != '\0' && n-- > 0) {
27 if (current_ypos >= MAX_YPOS) {
28 /* scroll 1 line up */
29 for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
30 for (i = 0; i < MAX_XPOS; i++) {
31 writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
32 VGABASE + 2*(MAX_XPOS*j + i));
33 }
34 }
35 for (i = 0; i < MAX_XPOS; i++)
36 writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
37 current_ypos = MAX_YPOS-1;
38 }
39 if (c == '\n') {
40 current_xpos = 0;
41 current_ypos++;
42 } else if (c != '\r') {
43 writew(((0x7 << 8) | (unsigned short) c),
44 VGABASE + 2*(MAX_XPOS*current_ypos +
45 current_xpos++));
46 if (current_xpos >= MAX_XPOS) {
47 current_xpos = 0;
48 current_ypos++;
49 }
50 }
51 }
52}
53
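Each VGA text cell written above is a 16-bit value with the attribute byte in the high half and the character in the low half; attribute 0x07 is the standard light-grey-on-black, so the blank fill value 0x720 is simply a space (0x20) with that attribute. A tiny sketch of the encoding:

#include <stdio.h>

/* build a VGA text-mode cell: high byte = attribute, low byte = character */
static unsigned short vga_cell(unsigned char attr, unsigned char ch)
{
	return (unsigned short)((attr << 8) | ch);
}

int main(void)
{
	printf("'A' with attribute 0x07 -> 0x%04x\n", vga_cell(0x07, 'A'));	/* 0x0741 */
	printf("blank fill              -> 0x%04x\n", vga_cell(0x07, ' '));	/* 0x0720 */
	return 0;
}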
54static struct console early_vga_console = {
55 .name = "earlyvga",
56 .write = early_vga_write,
57 .flags = CON_PRINTBUFFER,
58 .index = -1,
59};
60
61/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
62
63int early_serial_base = 0x3f8; /* ttyS0 */
64
65#define XMTRDY 0x20
66
67#define DLAB 0x80
68
69#define TXR 0 /* Transmit register (WRITE) */
70#define RXR 0 /* Receive register (READ) */
71#define IER 1 /* Interrupt Enable */
72#define IIR 2 /* Interrupt ID */
73#define FCR 2 /* FIFO control */
74#define LCR 3 /* Line control */
75#define MCR 4 /* Modem control */
76#define LSR 5 /* Line Status */
77#define MSR 6 /* Modem Status */
78#define DLL 0 /* Divisor Latch Low */
79#define DLH 1 /* Divisor latch High */
80
81static int early_serial_putc(unsigned char ch)
82{
83 unsigned timeout = 0xffff;
84 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
85 cpu_relax();
86 outb(ch, early_serial_base + TXR);
87 return timeout ? 0 : -1;
88}
89
90static void early_serial_write(struct console *con, const char *s, unsigned n)
91{
92 while (*s && n-- > 0) {
93 early_serial_putc(*s);
94 if (*s == '\n')
95 early_serial_putc('\r');
96 s++;
97 }
98}
99
100#define DEFAULT_BAUD 9600
101
102static __init void early_serial_init(char *s)
103{
104 unsigned char c;
105 unsigned divisor;
106 unsigned baud = DEFAULT_BAUD;
107 char *e;
108
109 if (*s == ',')
110 ++s;
111
112 if (*s) {
113 unsigned port;
114 if (!strncmp(s,"0x",2)) {
115 early_serial_base = simple_strtoul(s, &e, 16);
116 } else {
117 static int bases[] = { 0x3f8, 0x2f8 };
118
119 if (!strncmp(s,"ttyS",4))
120 s += 4;
121 port = simple_strtoul(s, &e, 10);
122 if (port > 1 || s == e)
123 port = 0;
124 early_serial_base = bases[port];
125 }
126 s += strcspn(s, ",");
127 if (*s == ',')
128 s++;
129 }
130
131 outb(0x3, early_serial_base + LCR); /* 8n1 */
132 outb(0, early_serial_base + IER); /* no interrupt */
133 outb(0, early_serial_base + FCR); /* no fifo */
134 outb(0x3, early_serial_base + MCR); /* DTR + RTS */
135
136 if (*s) {
137 baud = simple_strtoul(s, &e, 0);
138 if (baud == 0 || s == e)
139 baud = DEFAULT_BAUD;
140 }
141
142 divisor = 115200 / baud;
143 c = inb(early_serial_base + LCR);
144 outb(c | DLAB, early_serial_base + LCR);
145 outb(divisor & 0xff, early_serial_base + DLL);
146 outb((divisor >> 8) & 0xff, early_serial_base + DLH);
147 outb(c & ~DLAB, early_serial_base + LCR);
148}
149
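The divisor latch is programmed from a fixed 115200 base rate, so divisor = 115200 / baud (12 for 9600 baud, 1 for 115200), with the low byte written to DLL and the high byte to DLH while DLAB is set in LCR. A small sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
	/* sample baud rates; the kernel code computes divisor = 115200 / baud */
	unsigned bauds[] = { 9600, 38400, 115200 };
	unsigned i;

	for (i = 0; i < sizeof(bauds) / sizeof(bauds[0]); i++) {
		unsigned divisor = 115200 / bauds[i];
		printf("%6u baud -> divisor %3u (DLL=0x%02x, DLH=0x%02x)\n",
		       bauds[i], divisor, divisor & 0xff, (divisor >> 8) & 0xff);
	}
	return 0;
}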
150static struct console early_serial_console = {
151 .name = "earlyser",
152 .write = early_serial_write,
153 .flags = CON_PRINTBUFFER,
154 .index = -1,
155};
156
157/* Direct interface for emergencies */
158struct console *early_console = &early_vga_console;
159static int early_console_initialized = 0;
160
161void early_printk(const char *fmt, ...)
162{
163 char buf[512];
164 int n;
165 va_list ap;
166
167 va_start(ap,fmt);
168 n = vscnprintf(buf,512,fmt,ap);
169 early_console->write(early_console,buf,n);
170 va_end(ap);
171}
172
173static int keep_early;
174
175int __init setup_early_printk(char *opt)
176{
177 char *space;
178 char buf[256];
179
180 if (early_console_initialized)
181 return -1;
182
183 opt = strchr(opt, '=') + 1;
184
185 strlcpy(buf,opt,sizeof(buf));
186 space = strchr(buf, ' ');
187 if (space)
188 *space = 0;
189
190 if (strstr(buf,"keep"))
191 keep_early = 1;
192
193 if (!strncmp(buf, "serial", 6)) {
194 early_serial_init(buf + 6);
195 early_console = &early_serial_console;
196 } else if (!strncmp(buf, "ttyS", 4)) {
197 early_serial_init(buf);
198 early_console = &early_serial_console;
199 } else if (!strncmp(buf, "vga", 3)) {
200 early_console = &early_vga_console;
201 }
202 early_console_initialized = 1;
203 register_console(early_console);
204 return 0;
205}
206
207void __init disable_early_printk(void)
208{
209 if (!early_console_initialized || !early_console)
210 return;
211 if (!keep_early) {
212 printk("disabling early console\n");
213 unregister_console(early_console);
214 early_console_initialized = 0;
215 } else {
216 printk("keeping early console\n");
217 }
218}
219
220__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
new file mode 100644
index 000000000000..e126284db7a8
--- /dev/null
+++ b/arch/x86_64/kernel/entry.S
@@ -0,0 +1,920 @@
1/*
2 * linux/arch/x86_64/entry.S
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
7 *
8 * $Id$
9 */
10
11/*
12 * entry.S contains the system-call and fault low-level handling routines.
13 *
14 * NOTE: This code handles signal-recognition, which happens every time
15 * after an interrupt and after each system call.
16 *
17 * Normal syscalls and interrupts don't save a full stack frame; this is
18 * only done for syscall tracing, signals or fork/exec et al.
19 *
20 * A note on terminology:
21 * - top of stack: Architecture defined interrupt frame from SS to RIP
22 * at the top of the kernel process stack.
23 * - partial stack frame: partially saved registers up to R11.
24 * - full stack frame: Like partial stack frame, but all registers saved.
25 *
26 * TODO:
27 * - schedule it carefully for the final hardware.
28 */
29
30#define ASSEMBLY 1
31#include <linux/config.h>
32#include <linux/linkage.h>
33#include <asm/segment.h>
34#include <asm/smp.h>
35#include <asm/cache.h>
36#include <asm/errno.h>
37#include <asm/dwarf2.h>
38#include <asm/calling.h>
39#include <asm/offset.h>
40#include <asm/msr.h>
41#include <asm/unistd.h>
42#include <asm/thread_info.h>
43#include <asm/hw_irq.h>
44
45 .code64
46
47#ifdef CONFIG_PREEMPT
48#define preempt_stop cli
49#else
50#define preempt_stop
51#define retint_kernel retint_restore_args
52#endif
53
54/*
55 * C code is not supposed to know about undefined top of stack. Every time
56 * a C function with a pt_regs argument is called from the SYSCALL-based
57 * fast path, FIXUP_TOP_OF_STACK is needed.
58 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
59 * manipulation.
60 */
61
62 /* %rsp:at FRAMEEND */
63 .macro FIXUP_TOP_OF_STACK tmp
64 movq %gs:pda_oldrsp,\tmp
65 movq \tmp,RSP(%rsp)
66 movq $__USER_DS,SS(%rsp)
67 movq $__USER_CS,CS(%rsp)
68 movq $-1,RCX(%rsp)
69 movq R11(%rsp),\tmp /* get eflags */
70 movq \tmp,EFLAGS(%rsp)
71 .endm
72
73 .macro RESTORE_TOP_OF_STACK tmp,offset=0
74 movq RSP-\offset(%rsp),\tmp
75 movq \tmp,%gs:pda_oldrsp
76 movq EFLAGS-\offset(%rsp),\tmp
77 movq \tmp,R11-\offset(%rsp)
78 .endm
79
80 .macro FAKE_STACK_FRAME child_rip
81 /* push in order ss, rsp, eflags, cs, rip */
82 xorq %rax, %rax
83 pushq %rax /* ss */
84 CFI_ADJUST_CFA_OFFSET 8
85 pushq %rax /* rsp */
86 CFI_ADJUST_CFA_OFFSET 8
87 CFI_OFFSET rip,0
88 pushq $(1<<9) /* eflags - interrupts on */
89 CFI_ADJUST_CFA_OFFSET 8
90 pushq $__KERNEL_CS /* cs */
91 CFI_ADJUST_CFA_OFFSET 8
92 pushq \child_rip /* rip */
93 CFI_ADJUST_CFA_OFFSET 8
94 CFI_OFFSET rip,0
95 pushq %rax /* orig rax */
96 CFI_ADJUST_CFA_OFFSET 8
97 .endm
98
99 .macro UNFAKE_STACK_FRAME
100 addq $8*6, %rsp
101 CFI_ADJUST_CFA_OFFSET -(6*8)
102 .endm
103
104 .macro CFI_DEFAULT_STACK
105 CFI_ADJUST_CFA_OFFSET (SS)
106 CFI_OFFSET r15,R15-SS
107 CFI_OFFSET r14,R14-SS
108 CFI_OFFSET r13,R13-SS
109 CFI_OFFSET r12,R12-SS
110 CFI_OFFSET rbp,RBP-SS
111 CFI_OFFSET rbx,RBX-SS
112 CFI_OFFSET r11,R11-SS
113 CFI_OFFSET r10,R10-SS
114 CFI_OFFSET r9,R9-SS
115 CFI_OFFSET r8,R8-SS
116 CFI_OFFSET rax,RAX-SS
117 CFI_OFFSET rcx,RCX-SS
118 CFI_OFFSET rdx,RDX-SS
119 CFI_OFFSET rsi,RSI-SS
120 CFI_OFFSET rdi,RDI-SS
121 CFI_OFFSET rsp,RSP-SS
122 CFI_OFFSET rip,RIP-SS
123 .endm
124/*
125 * A newly forked process directly context switches into this.
126 */
127/* rdi: prev */
128ENTRY(ret_from_fork)
129 CFI_STARTPROC
130 CFI_DEFAULT_STACK
131 call schedule_tail
132 GET_THREAD_INFO(%rcx)
133 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
134 jnz rff_trace
135rff_action:
136 RESTORE_REST
137 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
138 je int_ret_from_sys_call
139 testl $_TIF_IA32,threadinfo_flags(%rcx)
140 jnz int_ret_from_sys_call
141 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
142 jmp ret_from_sys_call
143rff_trace:
144 movq %rsp,%rdi
145 call syscall_trace_leave
146 GET_THREAD_INFO(%rcx)
147 jmp rff_action
148 CFI_ENDPROC
149
150/*
151 * System call entry. Up to 6 arguments in registers are supported.
152 *
153 * SYSCALL does not save anything on the stack and does not change the
154 * stack pointer.
155 */
156
157/*
158 * Register setup:
159 * rax system call number
160 * rdi arg0
161 * rcx return address for syscall/sysret, C arg3
162 * rsi arg1
163 * rdx arg2
164 * r10 arg3 (--> moved to rcx for C)
165 * r8 arg4
166 * r9 arg5
167 * r11 eflags for syscall/sysret, temporary for C
168 * r12-r15,rbp,rbx saved by C code, not touched.
169 *
170 * Interrupts are off on entry.
171 * Only called from user space.
172 *
173 * XXX if we had a free scratch register we could save the RSP into the stack frame
174 * and report it properly in ps. Unfortunately we haven't.
175 */
176
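The register list above is the user-visible x86-64 SYSCALL convention. As a user-space illustration only (not part of this file), the following C sketch issues write(2) through a raw SYSCALL with exactly those registers; rcx and r11 are listed as clobbers because the instruction overwrites them with the return address and eflags:

/* minimal raw syscall wrapper following the convention documented above:
 * rax = syscall number, rdi/rsi/rdx = args 0-2, rcx/r11 clobbered by SYSCALL */
static long raw_syscall3(long nr, long a0, long a1, long a2)
{
	long ret;
	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" (nr), "D" (a0), "S" (a1), "d" (a2)
			     : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello via raw syscall\n";

	raw_syscall3(1 /* __NR_write on x86-64 */, 1 /* stdout */,
		     (long)msg, (long)(sizeof(msg) - 1));
	return 0;
}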
177ENTRY(system_call)
178 CFI_STARTPROC
179 swapgs
180 movq %rsp,%gs:pda_oldrsp
181 movq %gs:pda_kernelstack,%rsp
182 sti
183 SAVE_ARGS 8,1
184 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
185 movq %rcx,RIP-ARGOFFSET(%rsp)
186 GET_THREAD_INFO(%rcx)
187 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
188 jnz tracesys
189 cmpq $__NR_syscall_max,%rax
190 ja badsys
191 movq %r10,%rcx
192 call *sys_call_table(,%rax,8) # XXX: rip relative
193 movq %rax,RAX-ARGOFFSET(%rsp)
194/*
195 * Syscall return path ending with SYSRET (fast path)
196 * Has incomplete stack frame and undefined top of stack.
197 */
198 .globl ret_from_sys_call
199ret_from_sys_call:
200 movl $_TIF_WORK_MASK,%edi
201 /* edi: flagmask */
202sysret_check:
203 GET_THREAD_INFO(%rcx)
204 cli
205 movl threadinfo_flags(%rcx),%edx
206 andl %edi,%edx
207 jnz sysret_careful
208 movq RIP-ARGOFFSET(%rsp),%rcx
209 RESTORE_ARGS 0,-ARG_SKIP,1
210 movq %gs:pda_oldrsp,%rsp
211 swapgs
212 sysretq
213
214 /* Handle reschedules */
215 /* edx: work, edi: workmask */
216sysret_careful:
217 bt $TIF_NEED_RESCHED,%edx
218 jnc sysret_signal
219 sti
220 pushq %rdi
221 call schedule
222 popq %rdi
223 jmp sysret_check
224
225 /* Handle a signal */
226sysret_signal:
227 sti
228 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
229 jz 1f
230
231 /* Really a signal */
232 /* edx: work flags (arg3) */
233 leaq do_notify_resume(%rip),%rax
234 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
235 xorl %esi,%esi # oldset -> arg2
236 call ptregscall_common
2371: movl $_TIF_NEED_RESCHED,%edi
238 jmp sysret_check
239
240 /* Do syscall tracing */
241tracesys:
242 SAVE_REST
243 movq $-ENOSYS,RAX(%rsp)
244 FIXUP_TOP_OF_STACK %rdi
245 movq %rsp,%rdi
246 call syscall_trace_enter
247 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
248 RESTORE_REST
249 cmpq $__NR_syscall_max,%rax
250 ja 1f
251 movq %r10,%rcx /* fixup for C */
252 call *sys_call_table(,%rax,8)
253 movq %rax,RAX-ARGOFFSET(%rsp)
2541: SAVE_REST
255 movq %rsp,%rdi
256 call syscall_trace_leave
257 RESTORE_TOP_OF_STACK %rbx
258 RESTORE_REST
259 jmp ret_from_sys_call
260
261badsys:
262 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
263 jmp ret_from_sys_call
264
265/*
266 * Syscall return path ending with IRET.
267 * Has correct top of stack, but partial stack frame.
268 */
269ENTRY(int_ret_from_sys_call)
270 cli
271 testl $3,CS-ARGOFFSET(%rsp)
272 je retint_restore_args
273 movl $_TIF_ALLWORK_MASK,%edi
274 /* edi: mask to check */
275int_with_check:
276 GET_THREAD_INFO(%rcx)
277 movl threadinfo_flags(%rcx),%edx
278 andl %edi,%edx
279 jnz int_careful
280 jmp retint_swapgs
281
282 /* Either reschedule or signal or syscall exit tracking needed. */
283 /* First do a reschedule test. */
284 /* edx: work, edi: workmask */
285int_careful:
286 bt $TIF_NEED_RESCHED,%edx
287 jnc int_very_careful
288 sti
289 pushq %rdi
290 call schedule
291 popq %rdi
292 jmp int_with_check
293
294 /* handle signals and tracing -- both require a full stack frame */
295int_very_careful:
296 sti
297 SAVE_REST
298 /* Check for syscall exit trace */
299 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
300 jz int_signal
301 pushq %rdi
302 leaq 8(%rsp),%rdi # &ptregs -> arg1
303 call syscall_trace_leave
304 popq %rdi
305 btr $TIF_SYSCALL_TRACE,%edi
306 btr $TIF_SYSCALL_AUDIT,%edi
307 btr $TIF_SINGLESTEP,%edi
308 jmp int_restore_rest
309
310int_signal:
311 testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
312 jz 1f
313 movq %rsp,%rdi # &ptregs -> arg1
314 xorl %esi,%esi # oldset -> arg2
315 call do_notify_resume
3161: movl $_TIF_NEED_RESCHED,%edi
317int_restore_rest:
318 RESTORE_REST
319 jmp int_with_check
320 CFI_ENDPROC
321
322/*
323 * Certain special system calls need to save a complete stack frame.
324 */
325
326 .macro PTREGSCALL label,func,arg
327 .globl \label
328\label:
329 leaq \func(%rip),%rax
330 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
331 jmp ptregscall_common
332 .endm
333
334 PTREGSCALL stub_clone, sys_clone, %r8
335 PTREGSCALL stub_fork, sys_fork, %rdi
336 PTREGSCALL stub_vfork, sys_vfork, %rdi
337 PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
338 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
339 PTREGSCALL stub_iopl, sys_iopl, %rsi
340
341ENTRY(ptregscall_common)
342 CFI_STARTPROC
343 popq %r11
344 CFI_ADJUST_CFA_OFFSET -8
345 SAVE_REST
346 movq %r11, %r15
347 FIXUP_TOP_OF_STACK %r11
348 call *%rax
349 RESTORE_TOP_OF_STACK %r11
350 movq %r15, %r11
351 RESTORE_REST
352 pushq %r11
353 CFI_ADJUST_CFA_OFFSET 8
354 ret
355 CFI_ENDPROC
356
357ENTRY(stub_execve)
358 CFI_STARTPROC
359 popq %r11
360 CFI_ADJUST_CFA_OFFSET -8
361 SAVE_REST
362 movq %r11, %r15
363 FIXUP_TOP_OF_STACK %r11
364 call sys_execve
365 GET_THREAD_INFO(%rcx)
366 bt $TIF_IA32,threadinfo_flags(%rcx)
367 jc exec_32bit
368 RESTORE_TOP_OF_STACK %r11
369 movq %r15, %r11
370 RESTORE_REST
371 push %r11
372 ret
373
374exec_32bit:
375 CFI_ADJUST_CFA_OFFSET REST_SKIP
376 movq %rax,RAX(%rsp)
377 RESTORE_REST
378 jmp int_ret_from_sys_call
379 CFI_ENDPROC
380
381/*
382 * sigreturn is special because it needs to restore all registers on return.
383 * This cannot be done with SYSRET, so use the IRET return path instead.
384 */
385ENTRY(stub_rt_sigreturn)
386 CFI_STARTPROC
387 addq $8, %rsp
388 SAVE_REST
389 movq %rsp,%rdi
390 FIXUP_TOP_OF_STACK %r11
391 call sys_rt_sigreturn
392 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
393 RESTORE_REST
394 jmp int_ret_from_sys_call
395 CFI_ENDPROC
396
397/*
398 * Interrupt entry/exit.
399 *
400 * Interrupt entry points save only callee clobbered registers in fast path.
401 *
402 * Entry runs with interrupts off.
403 */
404
405/* 0(%rsp): interrupt number */
406 .macro interrupt func
407 CFI_STARTPROC simple
408 CFI_DEF_CFA rsp,(SS-RDI)
409 CFI_REL_OFFSET rsp,(RSP-ORIG_RAX)
410 CFI_REL_OFFSET rip,(RIP-ORIG_RAX)
411 cld
412#ifdef CONFIG_DEBUG_INFO
413 SAVE_ALL
414 movq %rsp,%rdi
415 /*
416 * Set up a stack frame pointer. This allows gdb to trace
417 * back to the original stack.
418 */
419 movq %rsp,%rbp
420 CFI_DEF_CFA_REGISTER rbp
421#else
422 SAVE_ARGS
423 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
424#endif
425 testl $3,CS(%rdi)
426 je 1f
427 swapgs
4281: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
429 movq %gs:pda_irqstackptr,%rax
430 cmoveq %rax,%rsp
431 pushq %rdi # save old stack
432 call \func
433 .endm
434
435ENTRY(common_interrupt)
436 interrupt do_IRQ
437 /* 0(%rsp): oldrsp-ARGOFFSET */
438ret_from_intr:
439 popq %rdi
440 cli
441 subl $1,%gs:pda_irqcount
442#ifdef CONFIG_DEBUG_INFO
443 movq RBP(%rdi),%rbp
444#endif
445 leaq ARGOFFSET(%rdi),%rsp
446exit_intr:
447 GET_THREAD_INFO(%rcx)
448 testl $3,CS-ARGOFFSET(%rsp)
449 je retint_kernel
450
451 /* Interrupt came from user space */
452 /*
453 * Has a correct top of stack, but a partial stack frame
454 * %rcx: thread info. Interrupts off.
455 */
456retint_with_reschedule:
457 movl $_TIF_WORK_MASK,%edi
458retint_check:
459 movl threadinfo_flags(%rcx),%edx
460 andl %edi,%edx
461 jnz retint_careful
462retint_swapgs:
463 cli
464 swapgs
465retint_restore_args:
466 cli
467 RESTORE_ARGS 0,8,0
468iret_label:
469 iretq
470
471 .section __ex_table,"a"
472 .quad iret_label,bad_iret
473 .previous
474 .section .fixup,"ax"
475 /* force a signal here? this matches i386 behaviour */
476 /* running with kernel gs */
477bad_iret:
478 movq $-9999,%rdi /* better code? */
479 jmp do_exit
480 .previous
481
482 /* edi: workmask, edx: work */
483retint_careful:
484 bt $TIF_NEED_RESCHED,%edx
485 jnc retint_signal
486 sti
487 pushq %rdi
488 call schedule
489 popq %rdi
490 GET_THREAD_INFO(%rcx)
491 cli
492 jmp retint_check
493
494retint_signal:
495 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
496 jz retint_swapgs
497 sti
498 SAVE_REST
499 movq $-1,ORIG_RAX(%rsp)
500 xorq %rsi,%rsi # oldset
501 movq %rsp,%rdi # &pt_regs
502 call do_notify_resume
503 RESTORE_REST
504 cli
505 movl $_TIF_NEED_RESCHED,%edi
506 GET_THREAD_INFO(%rcx)
507 jmp retint_check
508
509#ifdef CONFIG_PREEMPT
510 /* Returning to kernel space. Check if we need preemption */
511 /* rcx: threadinfo. interrupts off. */
512 .p2align
513retint_kernel:
514 cmpl $0,threadinfo_preempt_count(%rcx)
515 jnz retint_restore_args
516 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
517 jnc retint_restore_args
518 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
519 jnc retint_restore_args
520 call preempt_schedule_irq
521 jmp exit_intr
522#endif
523 CFI_ENDPROC
524
525/*
526 * APIC interrupts.
527 */
528 .macro apicinterrupt num,func
529 pushq $\num-256
530 interrupt \func
531 jmp ret_from_intr
532 CFI_ENDPROC
533 .endm
534
535ENTRY(thermal_interrupt)
536 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
537
538#ifdef CONFIG_SMP
539ENTRY(reschedule_interrupt)
540 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
541
542ENTRY(invalidate_interrupt)
543 apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
544
545ENTRY(call_function_interrupt)
546 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
547#endif
548
549#ifdef CONFIG_X86_LOCAL_APIC
550ENTRY(apic_timer_interrupt)
551 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
552
553ENTRY(error_interrupt)
554 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
555
556ENTRY(spurious_interrupt)
557 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
558#endif
559
560/*
561 * Exception entry points.
562 */
563 .macro zeroentry sym
564 pushq $0 /* push error code/oldrax */
565 pushq %rax /* push real oldrax to the rdi slot */
566 leaq \sym(%rip),%rax
567 jmp error_entry
568 .endm
569
570 .macro errorentry sym
571 pushq %rax
572 leaq \sym(%rip),%rax
573 jmp error_entry
574 .endm
575
576 /* error code is on the stack already */
577 /* handle NMI like exceptions that can happen everywhere */
578 .macro paranoidentry sym
579 SAVE_ALL
580 cld
581 movl $1,%ebx
582 movl $MSR_GS_BASE,%ecx
583 rdmsr
584 testl %edx,%edx
585 js 1f
586 swapgs
587 xorl %ebx,%ebx
5881: movq %rsp,%rdi
589 movq ORIG_RAX(%rsp),%rsi
590 movq $-1,ORIG_RAX(%rsp)
591 call \sym
592 .endm
593
594/*
595 * Exception entry point. This expects an error code/orig_rax on the stack
596 * and the exception handler in %rax.
597 */
598ENTRY(error_entry)
599 CFI_STARTPROC simple
600 CFI_DEF_CFA rsp,(SS-RDI)
601 CFI_REL_OFFSET rsp,(RSP-RDI)
602 CFI_REL_OFFSET rip,(RIP-RDI)
603 /* rdi slot contains rax, oldrax contains error code */
604 cld
605 subq $14*8,%rsp
606 CFI_ADJUST_CFA_OFFSET (14*8)
607 movq %rsi,13*8(%rsp)
608 CFI_REL_OFFSET rsi,RSI
609 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
610 movq %rdx,12*8(%rsp)
611 CFI_REL_OFFSET rdx,RDX
612 movq %rcx,11*8(%rsp)
613 CFI_REL_OFFSET rcx,RCX
614 movq %rsi,10*8(%rsp) /* store rax */
615 CFI_REL_OFFSET rax,RAX
616 movq %r8, 9*8(%rsp)
617 CFI_REL_OFFSET r8,R8
618 movq %r9, 8*8(%rsp)
619 CFI_REL_OFFSET r9,R9
620 movq %r10,7*8(%rsp)
621 CFI_REL_OFFSET r10,R10
622 movq %r11,6*8(%rsp)
623 CFI_REL_OFFSET r11,R11
624 movq %rbx,5*8(%rsp)
625 CFI_REL_OFFSET rbx,RBX
626 movq %rbp,4*8(%rsp)
627 CFI_REL_OFFSET rbp,RBP
628 movq %r12,3*8(%rsp)
629 CFI_REL_OFFSET r12,R12
630 movq %r13,2*8(%rsp)
631 CFI_REL_OFFSET r13,R13
632 movq %r14,1*8(%rsp)
633 CFI_REL_OFFSET r14,R14
634 movq %r15,(%rsp)
635 CFI_REL_OFFSET r15,R15
636 xorl %ebx,%ebx
637 testl $3,CS(%rsp)
638 je error_kernelspace
639error_swapgs:
640 swapgs
641error_sti:
642 movq %rdi,RDI(%rsp)
643 movq %rsp,%rdi
644 movq ORIG_RAX(%rsp),%rsi /* get error code */
645 movq $-1,ORIG_RAX(%rsp)
646 call *%rax
647 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
648error_exit:
649 movl %ebx,%eax
650 RESTORE_REST
651 cli
652 GET_THREAD_INFO(%rcx)
653 testl %eax,%eax
654 jne retint_kernel
655 movl threadinfo_flags(%rcx),%edx
656 movl $_TIF_WORK_MASK,%edi
657 andl %edi,%edx
658 jnz retint_careful
659 swapgs
660 RESTORE_ARGS 0,8,0
661 iretq
662 CFI_ENDPROC
663
664error_kernelspace:
665 incl %ebx
666 /* There are two places in the kernel that can potentially fault with
667 usergs. Handle them here. The exception handlers after
668 iret run with kernel gs again, so don't set the user space flag.
669 B stepping K8s sometimes report a truncated RIP for IRET
670 exceptions returning to compat mode. Check for these here too. */
671 leaq iret_label(%rip),%rbp
672 cmpq %rbp,RIP(%rsp)
673 je error_swapgs
674 movl %ebp,%ebp /* zero extend */
675 cmpq %rbp,RIP(%rsp)
676 je error_swapgs
677 cmpq $gs_change,RIP(%rsp)
678 je error_swapgs
679 jmp error_sti
680
681 /* Reload gs selector with exception handling */
682 /* edi: new selector */
683ENTRY(load_gs_index)
684 pushf
685 cli
686 swapgs
687gs_change:
688 movl %edi,%gs
6892: mfence /* workaround */
690 swapgs
691 popf
692 ret
693
694 .section __ex_table,"a"
695 .align 8
696 .quad gs_change,bad_gs
697 .previous
698 .section .fixup,"ax"
699 /* running with kernelgs */
700bad_gs:
701 swapgs /* switch back to user gs */
702 xorl %eax,%eax
703 movl %eax,%gs
704 jmp 2b
705 .previous
706
707/*
708 * Create a kernel thread.
709 *
710 * C extern interface:
711 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
712 *
713 * asm input arguments:
714 * rdi: fn, rsi: arg, rdx: flags
715 */
716ENTRY(kernel_thread)
717 CFI_STARTPROC
718 FAKE_STACK_FRAME $child_rip
719 SAVE_ALL
720
721 # rdi: flags, rsi: usp, rdx: will be &pt_regs
722 movq %rdx,%rdi
723 orq kernel_thread_flags(%rip),%rdi
724 movq $-1, %rsi
725 movq %rsp, %rdx
726
727 xorl %r8d,%r8d
728 xorl %r9d,%r9d
729
730 # clone now
731 call do_fork
732 movq %rax,RAX(%rsp)
733 xorl %edi,%edi
734
735 /*
736 * It isn't worth checking for a reschedule here,
737 * so internally to the x86_64 port you can rely on kernel_thread()
738 * not rescheduling the child before returning; this avoids the need
739 * for hacks, for example to fork off the per-CPU idle tasks.
740 * [Hopefully no generic code relies on the reschedule -AK]
741 */
742 RESTORE_ALL
743 UNFAKE_STACK_FRAME
744 ret
745 CFI_ENDPROC
746
747
748child_rip:
749 /*
750 * Here we are in the child and the registers are set as they were
751 * at kernel_thread() invocation in the parent.
752 */
753 movq %rdi, %rax
754 movq %rsi, %rdi
755 call *%rax
756 # exit
757 xorq %rdi, %rdi
758 call do_exit
759
760/*
761 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
762 *
763 * C extern interface:
764 * extern long execve(char *name, char **argv, char **envp)
765 *
766 * asm input arguments:
767 * rdi: name, rsi: argv, rdx: envp
768 *
769 * We want to fall back into:
770 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
771 *
772 * do_sys_execve asm fallback arguments:
773 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
774 */
775ENTRY(execve)
776 CFI_STARTPROC
777 FAKE_STACK_FRAME $0
778 SAVE_ALL
779 call sys_execve
780 movq %rax, RAX(%rsp)
781 RESTORE_REST
782 testq %rax,%rax
783 je int_ret_from_sys_call
784 RESTORE_ARGS
785 UNFAKE_STACK_FRAME
786 ret
787 CFI_ENDPROC
788
789ENTRY(page_fault)
790 errorentry do_page_fault
791
792ENTRY(coprocessor_error)
793 zeroentry do_coprocessor_error
794
795ENTRY(simd_coprocessor_error)
796 zeroentry do_simd_coprocessor_error
797
798ENTRY(device_not_available)
799 zeroentry math_state_restore
800
801 /* runs on exception stack */
802ENTRY(debug)
803 CFI_STARTPROC
804 pushq $0
805 CFI_ADJUST_CFA_OFFSET 8
806 paranoidentry do_debug
807 /* switch back to process stack to restore the state ptrace touched */
808 movq %rax,%rsp
809 testl $3,CS(%rsp)
810 jnz paranoid_userspace
811 jmp paranoid_exit
812 CFI_ENDPROC
813
814 /* runs on exception stack */
815ENTRY(nmi)
816 CFI_STARTPROC
817 pushq $-1
818 CFI_ADJUST_CFA_OFFSET 8
819 paranoidentry do_nmi
820 /* ebx: no swapgs flag */
821paranoid_exit:
822 testl %ebx,%ebx /* swapgs needed? */
823 jnz paranoid_restore
824paranoid_swapgs:
825 cli
826 swapgs
827paranoid_restore:
828 RESTORE_ALL 8
829 iretq
830paranoid_userspace:
831 cli
832 GET_THREAD_INFO(%rcx)
833 movl threadinfo_flags(%rcx),%edx
834 testl $_TIF_NEED_RESCHED,%edx
835 jnz paranoid_resched
836 testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
837 jnz paranoid_signal
838 jmp paranoid_swapgs
839paranoid_resched:
840 sti
841 call schedule
842 jmp paranoid_exit
843paranoid_signal:
844 sti
845 xorl %esi,%esi /* oldset */
846 movq %rsp,%rdi /* &pt_regs */
847 call do_notify_resume
848 jmp paranoid_exit
849 CFI_ENDPROC
850
851ENTRY(int3)
852 zeroentry do_int3
853
854ENTRY(overflow)
855 zeroentry do_overflow
856
857ENTRY(bounds)
858 zeroentry do_bounds
859
860ENTRY(invalid_op)
861 zeroentry do_invalid_op
862
863ENTRY(coprocessor_segment_overrun)
864 zeroentry do_coprocessor_segment_overrun
865
866ENTRY(reserved)
867 zeroentry do_reserved
868
869 /* runs on exception stack */
870ENTRY(double_fault)
871 CFI_STARTPROC
872 paranoidentry do_double_fault
873 movq %rax,%rsp
874 testl $3,CS(%rsp)
875 jnz paranoid_userspace
876 jmp paranoid_exit
877 CFI_ENDPROC
878
879ENTRY(invalid_TSS)
880 errorentry do_invalid_TSS
881
882ENTRY(segment_not_present)
883 errorentry do_segment_not_present
884
885 /* runs on exception stack */
886ENTRY(stack_segment)
887 CFI_STARTPROC
888 paranoidentry do_stack_segment
889 movq %rax,%rsp
890 testl $3,CS(%rsp)
891 jnz paranoid_userspace
892 jmp paranoid_exit
893 CFI_ENDPROC
894
895ENTRY(general_protection)
896 errorentry do_general_protection
897
898ENTRY(alignment_check)
899 errorentry do_alignment_check
900
901ENTRY(divide_error)
902 zeroentry do_divide_error
903
904ENTRY(spurious_interrupt_bug)
905 zeroentry do_spurious_interrupt_bug
906
907#ifdef CONFIG_X86_MCE
908 /* runs on exception stack */
909ENTRY(machine_check)
910 CFI_STARTPROC
911 pushq $0
912 CFI_ADJUST_CFA_OFFSET 8
913 paranoidentry do_machine_check
914 jmp paranoid_exit
915 CFI_ENDPROC
916#endif
917
918ENTRY(call_debug)
919 zeroentry do_call_debug
920
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
new file mode 100644
index 000000000000..d2c42fb99cb3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic.c
@@ -0,0 +1,89 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Generic APIC sub-arch probe layer.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/module.h>
19
20#include <asm/smp.h>
21#include <asm/ipi.h>
22
23/* which logical CPU number maps to which CPU (physical APIC ID) */
24u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
25EXPORT_SYMBOL(x86_cpu_to_apicid);
26u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
27
28extern struct genapic apic_cluster;
29extern struct genapic apic_flat;
30
31struct genapic *genapic = &apic_flat;
32
33
34/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */
37void __init clustered_apic_check(void)
38{
39 long i;
40 u8 clusters, max_cluster;
41 u8 id;
42 u8 cluster_cnt[NUM_APIC_CLUSTERS];
43
44 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
45 /* AMD always uses flat mode right now */
46 genapic = &apic_flat;
47 goto print;
48 }
49
50 memset(cluster_cnt, 0, sizeof(cluster_cnt));
51
52 for (i = 0; i < NR_CPUS; i++) {
53 id = bios_cpu_apicid[i];
54 if (id != BAD_APICID)
55 cluster_cnt[APIC_CLUSTERID(id)]++;
56 }
57
58 clusters = 0;
59 max_cluster = 0;
60 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
61 if (cluster_cnt[i] > 0) {
62 ++clusters;
63 if (cluster_cnt[i] > max_cluster)
64 max_cluster = cluster_cnt[i];
65 }
66 }
67
68 /*
69 * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
70 * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
71 * else physical mode.
72 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
73 * can ignore the clustered logical case and go straight to physical.)
74 */
75 if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
76 genapic = &apic_flat;
77 else
78 genapic = &apic_cluster;
79
80print:
81 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
82}
83
84/* Same for both flat and clustered. */
85
86void send_IPI_self(int vector)
87{
88 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
89}
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
new file mode 100644
index 000000000000..9703da7202e3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -0,0 +1,130 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
6 * (A more realistic maximum is around 230 CPUs.)
7 *
8 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
9 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
10 * James Cleverdon.
11 */
12#include <linux/config.h>
13#include <linux/threads.h>
14#include <linux/cpumask.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <asm/smp.h>
20#include <asm/ipi.h>
21
22
23/*
24 * Set up the logical destination ID.
25 *
 26 * Intel recommends setting DFR, LDR and TPR before enabling
27 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
28 * document number 292116). So here it goes...
29 */
30static void cluster_init_apic_ldr(void)
31{
32 unsigned long val, id;
33 long i, count;
34 u8 lid;
35 u8 my_id = hard_smp_processor_id();
36 u8 my_cluster = APIC_CLUSTER(my_id);
37
38 /* Create logical APIC IDs by counting CPUs already in cluster. */
39 for (count = 0, i = NR_CPUS; --i >= 0; ) {
40 lid = x86_cpu_to_log_apicid[i];
41 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
42 ++count;
43 }
44 /*
45 * We only have a 4 wide bitmap in cluster mode. There's no way
 46 * to get above 60 CPUs and still give each one its own bit.
47 * But, we're using physical IRQ delivery, so we don't care.
48 * Use bit 3 for the 4th through Nth CPU in each cluster.
49 */
50 if (count >= XAPIC_DEST_CPUS_SHIFT)
51 count = 3;
52 id = my_cluster | (1UL << count);
53 x86_cpu_to_log_apicid[smp_processor_id()] = id;
54 apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
55 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
56 val |= SET_APIC_LOGICAL_ID(id);
57 apic_write_around(APIC_LDR, val);
58}
59
60/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
61
62static cpumask_t cluster_target_cpus(void)
63{
64 return cpumask_of_cpu(0);
65}
66
67static void cluster_send_IPI_mask(cpumask_t mask, int vector)
68{
69 send_IPI_mask_sequence(mask, vector);
70}
71
72static void cluster_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75 cpu_clear(smp_processor_id(), mask);
76
77 if (!cpus_empty(mask))
78 cluster_send_IPI_mask(mask, vector);
79}
80
81static void cluster_send_IPI_all(int vector)
82{
83 cluster_send_IPI_mask(cpu_online_map, vector);
84}
85
86static int cluster_apic_id_registered(void)
87{
88 return 1;
89}
90
91static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
92{
93 int cpu;
94
95 /*
96 * We're using fixed IRQ delivery, can only return one phys APIC ID.
97 * May as well be the first.
98 */
99 cpu = first_cpu(cpumask);
100 if ((unsigned)cpu < NR_CPUS)
101 return x86_cpu_to_apicid[cpu];
102 else
103 return BAD_APICID;
104}
105
106/* cpuid returns the value latched in the HW at reset, not the APIC ID
107 * register's value. For any box whose BIOS changes APIC IDs, like
108 * clustered APIC systems, we must use hard_smp_processor_id.
109 *
110 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
111 */
112static unsigned int phys_pkg_id(int index_msb)
113{
114 return hard_smp_processor_id() >> index_msb;
115}
116
117struct genapic apic_cluster = {
118 .name = "clustered",
119 .int_delivery_mode = dest_Fixed,
120 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
121 .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
122 .target_cpus = cluster_target_cpus,
123 .apic_id_registered = cluster_apic_id_registered,
124 .init_apic_ldr = cluster_init_apic_ldr,
125 .send_IPI_all = cluster_send_IPI_all,
126 .send_IPI_allbutself = cluster_send_IPI_allbutself,
127 .send_IPI_mask = cluster_send_IPI_mask,
128 .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
129 .phys_pkg_id = phys_pkg_id,
130};
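
For illustration, a small userspace sketch of how cluster_init_apic_ldr() above
builds a logical APIC ID: the Nth CPU found in a cluster gets bit N of the 4-bit
destination bitmap, and everything from the 4th CPU onward shares bit 3. The
cluster-in-the-high-nibble layout and the example physical IDs are assumptions
for the sketch, not values taken from this file.

/* Hypothetical userspace sketch; not kernel code. */
#include <stdio.h>

#define CLUSTER(id) ((id) & 0xF0)   /* assumption: high nibble selects the cluster */
#define DEST_BITS   4               /* 4-wide logical bitmap per cluster */

static unsigned int make_logical_id(unsigned int phys_id, int count_in_cluster)
{
	if (count_in_cluster >= DEST_BITS)
		count_in_cluster = 3;                 /* 4th..Nth CPU share bit 3 */
	return CLUSTER(phys_id) | (1u << count_in_cluster);
}

int main(void)
{
	int n;

	/* Physical IDs 0x20..0x25 all live in cluster 0x20. */
	for (n = 0; n < 6; n++)
		printf("cpu #%d in cluster -> logical id 0x%02x\n",
		       n, make_logical_id(0x20 + n, n));
	/* prints 0x21, 0x22, 0x24, 0x28, 0x28, 0x28 */
	return 0;
}
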
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
new file mode 100644
index 000000000000..b4cbbad04226
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright 2004 James Cleverdon, IBM.
3 * Subject to the GNU Public License, v.2
4 *
5 * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
6 *
7 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
8 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
9 * James Cleverdon.
10 */
11#include <linux/config.h>
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h>
17#include <linux/init.h>
18#include <asm/smp.h>
19#include <asm/ipi.h>
20
21
22static cpumask_t flat_target_cpus(void)
23{
24 return cpu_online_map;
25}
26
27/*
28 * Set up the logical destination ID.
29 *
 30 * Intel recommends setting DFR, LDR and TPR before enabling
31 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
32 * document number 292116). So here it goes...
33 */
34static void flat_init_apic_ldr(void)
35{
36 unsigned long val;
37 unsigned long num, id;
38
39 num = smp_processor_id();
40 id = 1UL << num;
41 x86_cpu_to_log_apicid[num] = id;
42 apic_write_around(APIC_DFR, APIC_DFR_FLAT);
43 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
44 val |= SET_APIC_LOGICAL_ID(id);
45 apic_write_around(APIC_LDR, val);
46}
47
48static void flat_send_IPI_allbutself(int vector)
49{
50 /*
 51 * If there are no other CPUs in the system then
 52 * we get an APIC send error if we try to broadcast;
 53 * thus we have to avoid sending IPIs in this case.
54 */
55 if (num_online_cpus() > 1)
56 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
57}
58
59static void flat_send_IPI_all(int vector)
60{
61 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
62}
63
64static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
65{
66 unsigned long mask = cpus_addr(cpumask)[0];
67 unsigned long cfg;
68 unsigned long flags;
69
70 local_save_flags(flags);
71 local_irq_disable();
72
73 /*
74 * Wait for idle.
75 */
76 apic_wait_icr_idle();
77
78 /*
79 * prepare target chip field
80 */
81 cfg = __prepare_ICR2(mask);
82 apic_write_around(APIC_ICR2, cfg);
83
84 /*
85 * program the ICR
86 */
87 cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
88
89 /*
90 * Send the IPI. The write to APIC_ICR fires this off.
91 */
92 apic_write_around(APIC_ICR, cfg);
93 local_irq_restore(flags);
94}
95
96static int flat_apic_id_registered(void)
97{
98 return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
99}
100
101static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
102{
103 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
104}
105
106static unsigned int phys_pkg_id(int index_msb)
107{
108 u32 ebx;
109
110 ebx = cpuid_ebx(1);
111 return ((ebx >> 24) & 0xFF) >> index_msb;
112}
113
114struct genapic apic_flat = {
115 .name = "flat",
116 .int_delivery_mode = dest_LowestPrio,
117 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
118 .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
119 .target_cpus = flat_target_cpus,
120 .apic_id_registered = flat_apic_id_registered,
121 .init_apic_ldr = flat_init_apic_ldr,
122 .send_IPI_all = flat_send_IPI_all,
123 .send_IPI_allbutself = flat_send_IPI_allbutself,
124 .send_IPI_mask = flat_send_IPI_mask,
125 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
126 .phys_pkg_id = phys_pkg_id,
127};
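
For illustration, a tiny userspace sketch of flat-mode logical addressing as set
up above: flat_init_apic_ldr() gives CPU n the logical ID (1 << n), so the
destination for a set of CPUs is just the OR of their bits, which is what
flat_cpu_mask_to_apicid() extracts from the first cpumask word. APIC_ALL_CPUS is
assumed to be 0xFF (at most 8 CPUs in flat mode).

/* Hypothetical userspace sketch; not kernel code. */
#include <stdio.h>

#define APIC_ALL_CPUS 0xFFu   /* assumption: 8 logical destination bits */

static unsigned int mask_to_apicid(unsigned long cpumask_word)
{
	/* Each online CPU n contributes bit (1 << n); clip to 8 bits. */
	return (unsigned int)(cpumask_word & APIC_ALL_CPUS);
}

int main(void)
{
	unsigned long cpus_0_2_3 = (1UL << 0) | (1UL << 2) | (1UL << 3);

	printf("logical destination: 0x%02x\n", mask_to_apicid(cpus_0_2_3)); /* 0x0d */
	return 0;
}
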
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
new file mode 100644
index 000000000000..b6d8725c1f61
--- /dev/null
+++ b/arch/x86_64/kernel/head.S
@@ -0,0 +1,396 @@
1/*
2 * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
7 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
8 *
9 * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
10 */
11
12
13#include <linux/linkage.h>
14#include <linux/threads.h>
15#include <asm/desc.h>
16#include <asm/segment.h>
17#include <asm/page.h>
18#include <asm/msr.h>
19#include <asm/cache.h>
20
 21/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
 22 * because we need identity-mapped pages during setup, so define __START_KERNEL to
 23 * 0x100000 for this stage.
24 *
25 */
26
27 .text
28 .code32
29 .globl startup_32
30/* %bx: 1 if coming from smp trampoline on secondary cpu */
31startup_32:
32
33 /*
34 * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
35 * paging disabled and the point of this file is to switch to 64bit
 36 * long mode with a kernel mapping, and to jump into the
37 * kernel virtual addresses.
38 * There is no stack until we set one up.
39 */
40
41 /* Initialize the %ds segment register */
42 movl $__KERNEL_DS,%eax
43 movl %eax,%ds
44
45 /* Load new GDT with the 64bit segments using 32bit descriptor */
46 lgdt pGDT32 - __START_KERNEL_map
47
48 /* If the CPU doesn't support CPUID this will double fault.
49 * Unfortunately it is hard to check for CPUID without a stack.
50 */
51
52 /* Check if extended functions are implemented */
53 movl $0x80000000, %eax
54 cpuid
55 cmpl $0x80000000, %eax
56 jbe no_long_mode
57 /* Check if long mode is implemented */
58 mov $0x80000001, %eax
59 cpuid
60 btl $29, %edx
61 jnc no_long_mode
62
63 /*
 64 * Prepare for entering 64bit mode
65 */
66
67 /* Enable PAE mode */
68 xorl %eax, %eax
69 btsl $5, %eax
70 movl %eax, %cr4
71
72 /* Setup early boot stage 4 level pagetables */
73 movl $(init_level4_pgt - __START_KERNEL_map), %eax
74 movl %eax, %cr3
75
76 /* Setup EFER (Extended Feature Enable Register) */
77 movl $MSR_EFER, %ecx
78 rdmsr
79
80 /* Enable Long Mode */
81 btsl $_EFER_LME, %eax
82
83 /* Make changes effective */
84 wrmsr
85
86 xorl %eax, %eax
87 btsl $31, %eax /* Enable paging and in turn activate Long Mode */
88 btsl $0, %eax /* Enable protected mode */
89 /* Make changes effective */
90 movl %eax, %cr0
91 /*
92 * At this point we're in long mode but in 32bit compatibility mode
93 * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
94 * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
95 * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
96 */
97 ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
98
99 .code64
100 .org 0x100
101 .globl startup_64
102startup_64:
103 /* We come here either from startup_32
104 * or directly from a 64bit bootloader.
105 * Since we may have come directly from a bootloader we
106 * reload the page tables here.
107 */
108
109 /* Enable PAE mode and PGE */
110 xorq %rax, %rax
111 btsq $5, %rax
112 btsq $7, %rax
113 movq %rax, %cr4
114
115 /* Setup early boot stage 4 level pagetables. */
116 movq $(init_level4_pgt - __START_KERNEL_map), %rax
117 movq %rax, %cr3
118
119 /* Check if nx is implemented */
120 movl $0x80000001, %eax
121 cpuid
122 movl %edx,%edi
123
124 /* Setup EFER (Extended Feature Enable Register) */
125 movl $MSR_EFER, %ecx
126 rdmsr
127
128 /* Enable System Call */
129 btsl $_EFER_SCE, %eax
130
131 /* No Execute supported? */
132 btl $20,%edi
133 jnc 1f
134 btsl $_EFER_NX, %eax
1351:
136 /* Make changes effective */
137 wrmsr
138
139 /* Setup cr0 */
140 xorq %rax, %rax
141 btsq $31, %rax /* Enable paging */
142 btsq $0, %rax /* Enable protected mode */
143 btsq $1, %rax /* Enable MP */
144 btsq $4, %rax /* Enable ET */
145 btsq $5, %rax /* Enable NE */
146 btsq $16, %rax /* Enable WP */
147 btsq $18, %rax /* Enable AM */
148 /* Make changes effective */
149 movq %rax, %cr0
150
151 /* Setup a boot time stack */
152 movq init_rsp(%rip),%rsp
153
154 /* zero EFLAGS after setting rsp */
155 pushq $0
156 popfq
157
158 /*
159 * We must switch to a new descriptor in kernel space for the GDT
160 * because soon the kernel won't have access anymore to the userspace
 161 * addresses we are currently running on. We have to do that here
162 * because in 32bit we couldn't load a 64bit linear address.
163 */
164 lgdt cpu_gdt_descr
165
166 /*
 167 * Set up a dummy PDA. This is just for some early bootup code
168 * that does in_interrupt()
169 */
170 movl $MSR_GS_BASE,%ecx
171 movq $empty_zero_page,%rax
172 movq %rax,%rdx
173 shrq $32,%rdx
174 wrmsr
175
176 /* set up data segments. actually 0 would do too */
177 movl $__KERNEL_DS,%eax
178 movl %eax,%ds
179 movl %eax,%ss
180 movl %eax,%es
181
182 /* esi is pointer to real mode structure with interesting info.
183 pass it to C */
184 movl %esi, %edi
185
 186 /* Finally jump to run C code and to be on the real kernel address.
 187 * Since we are running on identity-mapped space we have to jump
 188 * to the full 64bit address; this is only possible with an
 189 * indirect jump.
190 */
191 movq initial_code(%rip),%rax
192 jmp *%rax
193
194 /* SMP bootup changes these two */
195 .globl initial_code
196initial_code:
197 .quad x86_64_start_kernel
198 .globl init_rsp
199init_rsp:
200 .quad init_thread_union+THREAD_SIZE-8
201
202ENTRY(early_idt_handler)
203 xorl %eax,%eax
204 movq 8(%rsp),%rsi # get rip
205 movq (%rsp),%rdx
206 movq %cr2,%rcx
207 leaq early_idt_msg(%rip),%rdi
208 call early_printk
2091: hlt
210 jmp 1b
211
212early_idt_msg:
213 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
214
215.code32
216ENTRY(no_long_mode)
217 /* This isn't an x86-64 CPU so hang */
2181:
219 jmp 1b
220
221.org 0xf00
222 .globl pGDT32
223pGDT32:
224 .word gdt_end-cpu_gdt_table
225 .long cpu_gdt_table-__START_KERNEL_map
226
227.org 0xf10
228ljumpvector:
229 .long startup_64-__START_KERNEL_map
230 .word __KERNEL_CS
231
232ENTRY(stext)
233ENTRY(_stext)
234
235 /*
236 * This default setting generates an ident mapping at address 0x100000
237 * and a mapping for the kernel that precisely maps virtual address
238 * 0xffffffff80000000 to physical address 0x000000. (always using
239 * 2Mbyte large pages provided by PAE mode)
240 */
241.org 0x1000
242ENTRY(init_level4_pgt)
243 .quad 0x0000000000102007 /* -> level3_ident_pgt */
244 .fill 255,8,0
245 .quad 0x000000000010a007
246 .fill 254,8,0
247 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
248 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
249
250.org 0x2000
251ENTRY(level3_ident_pgt)
252 .quad 0x0000000000104007
253 .fill 511,8,0
254
255.org 0x3000
256ENTRY(level3_kernel_pgt)
257 .fill 510,8,0
258 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
259 .quad 0x0000000000105007 /* -> level2_kernel_pgt */
260 .fill 1,8,0
261
262.org 0x4000
263ENTRY(level2_ident_pgt)
264 /* 40MB for bootup. */
265 .quad 0x0000000000000283
266 .quad 0x0000000000200183
267 .quad 0x0000000000400183
268 .quad 0x0000000000600183
269 .quad 0x0000000000800183
270 .quad 0x0000000000A00183
271 .quad 0x0000000000C00183
272 .quad 0x0000000000E00183
273 .quad 0x0000000001000183
274 .quad 0x0000000001200183
275 .quad 0x0000000001400183
276 .quad 0x0000000001600183
277 .quad 0x0000000001800183
278 .quad 0x0000000001A00183
279 .quad 0x0000000001C00183
280 .quad 0x0000000001E00183
281 .quad 0x0000000002000183
282 .quad 0x0000000002200183
283 .quad 0x0000000002400183
284 .quad 0x0000000002600183
285 /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
286 .globl temp_boot_pmds
287temp_boot_pmds:
288 .fill 492,8,0
289
290.org 0x5000
291ENTRY(level2_kernel_pgt)
292 /* 40MB kernel mapping. The kernel code cannot be bigger than that.
 293 When you change this, change KERNEL_TEXT_SIZE in page.h too. */
294 /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
295 .quad 0x0000000000000183
296 .quad 0x0000000000200183
297 .quad 0x0000000000400183
298 .quad 0x0000000000600183
299 .quad 0x0000000000800183
300 .quad 0x0000000000A00183
301 .quad 0x0000000000C00183
302 .quad 0x0000000000E00183
303 .quad 0x0000000001000183
304 .quad 0x0000000001200183
305 .quad 0x0000000001400183
306 .quad 0x0000000001600183
307 .quad 0x0000000001800183
308 .quad 0x0000000001A00183
309 .quad 0x0000000001C00183
310 .quad 0x0000000001E00183
311 .quad 0x0000000002000183
312 .quad 0x0000000002200183
313 .quad 0x0000000002400183
314 .quad 0x0000000002600183
315 /* Module mapping starts here */
316 .fill 492,8,0
317
318.org 0x6000
319ENTRY(empty_zero_page)
320
321.org 0x7000
322ENTRY(empty_bad_page)
323
324.org 0x8000
325ENTRY(empty_bad_pte_table)
326
327.org 0x9000
328ENTRY(empty_bad_pmd_table)
329
330.org 0xa000
331ENTRY(level3_physmem_pgt)
332 .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
333
334 .org 0xb000
335#ifdef CONFIG_ACPI_SLEEP
336ENTRY(wakeup_level4_pgt)
337 .quad 0x0000000000102007 /* -> level3_ident_pgt */
338 .fill 255,8,0
339 .quad 0x000000000010a007
340 .fill 254,8,0
341 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
342 .quad 0x0000000000103007 /* -> level3_kernel_pgt */
343#endif
344
345 .data
346
347 .align 16
348 .globl cpu_gdt_descr
349cpu_gdt_descr:
350 .word gdt_end-cpu_gdt_table
351gdt:
352 .quad cpu_gdt_table
353#ifdef CONFIG_SMP
354 .rept NR_CPUS-1
355 .word 0
356 .quad 0
357 .endr
358#endif
359
 360/* We need valid kernel segments for data and code in long mode too;
 361 * IRET will check the segment types.  kkeil 2000/10/28
 362 * Also, sysret mandates a special GDT layout.
363 */
364
365.align L1_CACHE_BYTES
366
367/* The TLS descriptors are currently at a different place compared to i386.
368 Hopefully nobody expects them at a fixed place (Wine?) */
369
370ENTRY(cpu_gdt_table)
371 .quad 0x0000000000000000 /* NULL descriptor */
372 .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */
373 .quad 0x00af9a000000ffff /* __KERNEL_CS */
374 .quad 0x00cf92000000ffff /* __KERNEL_DS */
375 .quad 0x00cffa000000ffff /* __USER32_CS */
376 .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
377 .quad 0x00affa000000ffff /* __USER_CS */
378 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
379 .quad 0,0 /* TSS */
380 .quad 0,0 /* LDT */
381 .quad 0,0,0 /* three TLS descriptors */
382 .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
383 /* base must be patched for real base address. */
384gdt_end:
385 /* asm/segment.h:GDT_ENTRIES must match this */
386 /* This should be a multiple of the cache line size */
387 /* GDTs of other CPUs: */
388 .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
389
390 .align L1_CACHE_BYTES
391ENTRY(idt_table)
392 .rept 256
393 .quad 0
394 .quad 0
395 .endr
396
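
For illustration, a small userspace check of the index arithmetic quoted in the
comments of init_level4_pgt and level3_kernel_pgt above: the kernel text mapping
at 0xffffffff80000000 keeps only its low 48 bits in the page-table walk, which
lands in level-4 slot 511 and level-3 slot 510. This is just a sketch verifying
the arithmetic, not kernel code.

/* Hypothetical userspace sketch; not kernel code. */
#include <stdio.h>

int main(void)
{
	/* 0xffffffff80000000 truncated to 48 canonical bits: 2^48 - 2GB */
	unsigned long long va48 = (1ULL << 48) - (2ULL << 30);

	unsigned int l4_idx = (unsigned int)((va48 >> 39) & 0x1FF);  /* level4 slot */
	unsigned int l3_idx = (unsigned int)((va48 >> 30) & 0x1FF);  /* level3 slot */

	printf("level4 index = %u\n", l4_idx);   /* 511, matching the comment */
	printf("level3 index = %u\n", l3_idx);   /* 510, matching the comment */
	return 0;
}
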
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
new file mode 100644
index 000000000000..6cad46c98a23
--- /dev/null
+++ b/arch/x86_64/kernel/head64.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 *
6 * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
7 */
8
9#include <linux/init.h>
10#include <linux/linkage.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/percpu.h>
15
16#include <asm/processor.h>
17#include <asm/proto.h>
18#include <asm/smp.h>
19#include <asm/bootsetup.h>
20#include <asm/setup.h>
21#include <asm/desc.h>
22
23/* Don't add a printk in there. printk relies on the PDA which is not initialized
24 yet. */
25static void __init clear_bss(void)
26{
27 extern char __bss_start[], __bss_end[];
28 memset(__bss_start, 0,
29 (unsigned long) __bss_end - (unsigned long) __bss_start);
30}
31
32extern char x86_boot_params[2048];
33
34#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
35#define OLD_CL_MAGIC_ADDR 0x90020
36#define OLD_CL_MAGIC 0xA33F
37#define OLD_CL_BASE_ADDR 0x90000
38#define OLD_CL_OFFSET 0x90022
39
40extern char saved_command_line[];
41
42static void __init copy_bootdata(char *real_mode_data)
43{
44 int new_data;
45 char * command_line;
46
47 memcpy(x86_boot_params, real_mode_data, 2048);
48 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
49 if (!new_data) {
50 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
51 printk("so old bootloader that it does not support commandline?!\n");
52 return;
53 }
54 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
55 printk("old bootloader convention, maybe loadlin?\n");
56 }
57 command_line = (char *) ((u64)(new_data));
58 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
59 printk("Bootdata ok (command line is %s)\n", saved_command_line);
60}
61
62static void __init setup_boot_cpu_data(void)
63{
64 unsigned int dummy, eax;
65
66 /* get vendor info */
67 cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
68 (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
69 (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
70 (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
71
72 /* get cpu type */
73 cpuid(1, &eax, &dummy, &dummy,
74 (unsigned int *) &boot_cpu_data.x86_capability);
75 boot_cpu_data.x86 = (eax >> 8) & 0xf;
76 boot_cpu_data.x86_model = (eax >> 4) & 0xf;
77 boot_cpu_data.x86_mask = eax & 0xf;
78}
79
80extern char _end[];
81
82void __init x86_64_start_kernel(char * real_mode_data)
83{
84 char *s;
85 int i;
86
87 for (i = 0; i < 256; i++)
88 set_intr_gate(i, early_idt_handler);
89 asm volatile("lidt %0" :: "m" (idt_descr));
90 clear_bss();
91 pda_init(0);
92 copy_bootdata(real_mode_data);
93#ifdef CONFIG_SMP
94 cpu_set(0, cpu_online_map);
95#endif
96 /* default console: */
97 if (!strstr(saved_command_line, "console="))
98 strcat(saved_command_line, " console=tty0");
99 s = strstr(saved_command_line, "earlyprintk=");
100 if (s != NULL)
101 setup_early_printk(s);
102#ifdef CONFIG_DISCONTIGMEM
103 s = strstr(saved_command_line, "numa=");
104 if (s != NULL)
105 numa_setup(s+5);
106#endif
107#ifdef CONFIG_X86_IO_APIC
108 if (strstr(saved_command_line, "disableapic"))
109 disable_apic = 1;
110#endif
 111 /* You need an early console to see this */
112 if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
113 panic("Kernel too big for kernel mapping\n");
114
115 setup_boot_cpu_data();
116 start_kernel();
117}
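
For illustration, a userspace sketch of the CPUID decoding done by
setup_boot_cpu_data() above: leaf 1 EAX is split into family, model and stepping
nibbles using the classic (pre extended-family) layout. The sample value passed
in main() is made up for the example.

/* Hypothetical userspace sketch; not kernel code. */
#include <stdio.h>

static void decode_signature(unsigned int eax)
{
	unsigned int family   = (eax >> 8) & 0xF;
	unsigned int model    = (eax >> 4) & 0xF;
	unsigned int stepping = eax & 0xF;

	printf("family %u, model %u, stepping %u\n", family, model, stepping);
}

int main(void)
{
	decode_signature(0x0F48);   /* hypothetical value: family 15, model 4, stepping 8 */
	return 0;
}
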
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
new file mode 100644
index 000000000000..ba139cac57ce
--- /dev/null
+++ b/arch/x86_64/kernel/i387.c
@@ -0,0 +1,155 @@
1/*
2 * linux/arch/x86_64/kernel/i387.c
3 *
4 * Copyright (C) 1994 Linus Torvalds
5 * Copyright (C) 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * General FPU state handling cleanups
9 * Gareth Hughes <gareth@valinux.com>, May 2000
10 *
11 * x86-64 rework 2002 Andi Kleen.
12 * Does direct fxsave in and out of user space now for signal handlers.
13 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
14 * the 64bit user space sees a FXSAVE frame directly.
15 */
16
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/init.h>
20#include <asm/processor.h>
21#include <asm/i387.h>
22#include <asm/sigcontext.h>
23#include <asm/user.h>
24#include <asm/ptrace.h>
25#include <asm/uaccess.h>
26
27unsigned int mxcsr_feature_mask = 0xffffffff;
28
29void mxcsr_feature_mask_init(void)
30{
31 unsigned int mask;
32 clts();
33 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
34 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
35 mask = current->thread.i387.fxsave.mxcsr_mask;
36 if (mask == 0) mask = 0x0000ffbf;
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * Called at bootup to set up the initial FPU state that is later cloned
43 * into all processes.
44 */
45void __init fpu_init(void)
46{
47 unsigned long oldcr0 = read_cr0();
48 extern void __bad_fxsave_alignment(void);
49
50 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
51 __bad_fxsave_alignment();
52 set_in_cr4(X86_CR4_OSFXSR);
53 set_in_cr4(X86_CR4_OSXMMEXCPT);
54
55 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
56
57 mxcsr_feature_mask_init();
58 /* clean state in init */
59 current_thread_info()->status = 0;
60 clear_used_math();
61}
62
63void init_fpu(struct task_struct *child)
64{
65 if (tsk_used_math(child)) {
66 if (child == current)
67 unlazy_fpu(child);
68 return;
69 }
70 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
71 child->thread.i387.fxsave.cwd = 0x37f;
72 child->thread.i387.fxsave.mxcsr = 0x1f80;
73 /* only the device not available exception or ptrace can call init_fpu */
74 set_stopped_child_used_math(child);
75}
76
77/*
78 * Signal frame handlers.
79 */
80
81int save_i387(struct _fpstate __user *buf)
82{
83 struct task_struct *tsk = current;
84 int err = 0;
85
86 {
87 extern void bad_user_i387_struct(void);
88 if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
89 bad_user_i387_struct();
90 }
91
92 if ((unsigned long)buf % 16)
93 printk("save_i387: bad fpstate %p\n",buf);
94
95 if (!used_math())
96 return 0;
97 clear_used_math(); /* trigger finit */
98 if (tsk->thread_info->status & TS_USEDFPU) {
99 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
100 if (err) return err;
101 stts();
102 } else {
103 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
104 sizeof(struct i387_fxsave_struct)))
105 return -1;
106 }
107 return 1;
108}
109
110/*
111 * ptrace request handlers.
112 */
113
114int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
115{
116 init_fpu(tsk);
117 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
118 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
119}
120
121int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
122{
123 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
124 sizeof(struct user_i387_struct)))
125 return -EFAULT;
126 return 0;
127}
128
129/*
130 * FPU state for core dumps.
131 */
132
133int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
134{
135 struct task_struct *tsk = current;
136
137 if (!used_math())
138 return 0;
139
140 unlazy_fpu(tsk);
141 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
142 return 1;
143}
144
145int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
146{
147 int fpvalid = !!tsk_used_math(tsk);
148
149 if (fpvalid) {
150 if (tsk == current)
151 unlazy_fpu(tsk);
152 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
 153 }
154 return fpvalid;
155}
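
For illustration, a standalone sketch of the link-time assertion idiom used by
fpu_init() (__bad_fxsave_alignment) and save_i387() (bad_user_i387_struct) above:
a call to a function that is declared but never defined sits behind a condition
the compiler can evaluate at build time, so the binary only links if the
condition is impossible. The structure below is a made-up stand-in; the trick
only works as shown when built with optimization enabled, as the kernel always is.

/* Hypothetical sketch of the idiom; not kernel code. Build with -O2. */
#include <stddef.h>
#include <stdio.h>

struct fxsave_like { char buf[512]; } __attribute__((aligned(16)));

struct container {
	long some_field;
	struct fxsave_like fx;   /* fxsave needs this 16-byte aligned */
};

/* Never defined anywhere: if the branch below is not provably dead,
 * the final link fails, flagging the alignment problem at build time. */
extern void bad_alignment_detected(void);

int main(void)
{
	if (offsetof(struct container, fx) & 15)
		bad_alignment_detected();

	printf("fxsave area offset: %zu\n", offsetof(struct container, fx));
	return 0;
}
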
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
new file mode 100644
index 000000000000..d52701246010
--- /dev/null
+++ b/arch/x86_64/kernel/i8259.c
@@ -0,0 +1,579 @@
1#include <linux/linkage.h>
2#include <linux/config.h>
3#include <linux/errno.h>
4#include <linux/signal.h>
5#include <linux/sched.h>
6#include <linux/ioport.h>
7#include <linux/interrupt.h>
8#include <linux/timex.h>
9#include <linux/slab.h>
10#include <linux/random.h>
11#include <linux/smp_lock.h>
12#include <linux/init.h>
13#include <linux/kernel_stat.h>
14#include <linux/sysdev.h>
15#include <linux/bitops.h>
16
17#include <asm/acpi.h>
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/io.h>
21#include <asm/irq.h>
22#include <asm/hw_irq.h>
23#include <asm/pgtable.h>
24#include <asm/delay.h>
25#include <asm/desc.h>
26#include <asm/apic.h>
27
28#include <linux/irq.h>
29
30/*
31 * Common place to define all x86 IRQ vectors
32 *
33 * This builds up the IRQ handler stubs using some ugly macros in irq.h
34 *
35 * These macros create the low-level assembly IRQ routines that save
36 * register context and call do_IRQ(). do_IRQ() then does all the
37 * operations that are needed to keep the AT (or SMP IOAPIC)
38 * interrupt-controller happy.
39 */
40
41#define BI(x,y) \
42 BUILD_IRQ(x##y)
43
44#define BUILD_16_IRQS(x) \
45 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
46 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
47 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
48 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
49
50#define BUILD_14_IRQS(x) \
51 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
52 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
53 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
54 BI(x,c) BI(x,d)
55
56/*
57 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
58 * (these are usually mapped to vectors 0x20-0x2f)
59 */
60BUILD_16_IRQS(0x0)
61
62#ifdef CONFIG_X86_LOCAL_APIC
63/*
64 * The IO-APIC gives us many more interrupt sources. Most of these
65 * are unused but an SMP system is supposed to have enough memory ...
66 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
67 * across the spectrum, so we really want to be prepared to get all
68 * of these. Plus, more powerful systems might have more than 64
69 * IO-APIC registers.
70 *
71 * (these are usually mapped into the 0x30-0xff vector range)
72 */
73 BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
74BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
75BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
76BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
77
78#ifdef CONFIG_PCI_MSI
79 BUILD_14_IRQS(0xe)
80#endif
81
82#endif
83
84#undef BUILD_16_IRQS
85#undef BUILD_14_IRQS
86#undef BI
87
88
89#define IRQ(x,y) \
90 IRQ##x##y##_interrupt
91
92#define IRQLIST_16(x) \
93 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
94 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
95 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
96 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
97
98#define IRQLIST_14(x) \
99 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
100 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
101 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
102 IRQ(x,c), IRQ(x,d)
103
104void (*interrupt[NR_IRQS])(void) = {
105 IRQLIST_16(0x0),
106
107#ifdef CONFIG_X86_IO_APIC
108 IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
109 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
110 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
111 IRQLIST_16(0xc), IRQLIST_16(0xd)
112
113#ifdef CONFIG_PCI_MSI
114 , IRQLIST_14(0xe)
115#endif
116
117#endif
118};
119
120#undef IRQ
121#undef IRQLIST_16
122#undef IRQLIST_14
123
124/*
125 * This is the 'legacy' 8259A Programmable Interrupt Controller,
 126 * present in the majority of PC/AT boxes,
 127 * plus some generic x86-specific things, if generic specifics make
 128 * any sense at all.
 129 * This file should become arch/i386/kernel/irq.c when the old irq.c
 130 * moves to arch-independent land.
131 */
132
133DEFINE_SPINLOCK(i8259A_lock);
134
135static void end_8259A_irq (unsigned int irq)
136{
137 if (irq > 256) {
138 char var;
139 printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info);
140
141 BUG();
142 }
143
144 if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
145 irq_desc[irq].action)
146 enable_8259A_irq(irq);
147}
148
149#define shutdown_8259A_irq disable_8259A_irq
150
151static void mask_and_ack_8259A(unsigned int);
152
153static unsigned int startup_8259A_irq(unsigned int irq)
154{
155 enable_8259A_irq(irq);
156 return 0; /* never anything pending */
157}
158
159static struct hw_interrupt_type i8259A_irq_type = {
160 "XT-PIC",
161 startup_8259A_irq,
162 shutdown_8259A_irq,
163 enable_8259A_irq,
164 disable_8259A_irq,
165 mask_and_ack_8259A,
166 end_8259A_irq,
167 NULL
168};
169
170/*
171 * 8259A PIC functions to handle ISA devices:
172 */
173
174/*
175 * This contains the irq mask for both 8259A irq controllers,
176 */
177static unsigned int cached_irq_mask = 0xffff;
178
179#define __byte(x,y) (((unsigned char *)&(y))[x])
180#define cached_21 (__byte(0,cached_irq_mask))
181#define cached_A1 (__byte(1,cached_irq_mask))
182
183/*
184 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
185 * boards the timer interrupt is not really connected to any IO-APIC pin,
186 * it's fed to the master 8259A's IR0 line only.
187 *
188 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
189 * this 'mixed mode' IRQ handling costs nothing because it's only used
190 * at IRQ setup time.
191 */
192unsigned long io_apic_irqs;
193
194void disable_8259A_irq(unsigned int irq)
195{
196 unsigned int mask = 1 << irq;
197 unsigned long flags;
198
199 spin_lock_irqsave(&i8259A_lock, flags);
200 cached_irq_mask |= mask;
201 if (irq & 8)
202 outb(cached_A1,0xA1);
203 else
204 outb(cached_21,0x21);
205 spin_unlock_irqrestore(&i8259A_lock, flags);
206}
207
208void enable_8259A_irq(unsigned int irq)
209{
210 unsigned int mask = ~(1 << irq);
211 unsigned long flags;
212
213 spin_lock_irqsave(&i8259A_lock, flags);
214 cached_irq_mask &= mask;
215 if (irq & 8)
216 outb(cached_A1,0xA1);
217 else
218 outb(cached_21,0x21);
219 spin_unlock_irqrestore(&i8259A_lock, flags);
220}
221
222int i8259A_irq_pending(unsigned int irq)
223{
224 unsigned int mask = 1<<irq;
225 unsigned long flags;
226 int ret;
227
228 spin_lock_irqsave(&i8259A_lock, flags);
229 if (irq < 8)
230 ret = inb(0x20) & mask;
231 else
232 ret = inb(0xA0) & (mask >> 8);
233 spin_unlock_irqrestore(&i8259A_lock, flags);
234
235 return ret;
236}
237
238void make_8259A_irq(unsigned int irq)
239{
240 disable_irq_nosync(irq);
241 io_apic_irqs &= ~(1<<irq);
242 irq_desc[irq].handler = &i8259A_irq_type;
243 enable_irq(irq);
244}
245
246/*
 247 * This function is expected to be called rarely. Switching between
248 * 8259A registers is slow.
249 * This has to be protected by the irq controller spinlock
250 * before being called.
251 */
252static inline int i8259A_irq_real(unsigned int irq)
253{
254 int value;
255 int irqmask = 1<<irq;
256
257 if (irq < 8) {
258 outb(0x0B,0x20); /* ISR register */
259 value = inb(0x20) & irqmask;
260 outb(0x0A,0x20); /* back to the IRR register */
261 return value;
262 }
263 outb(0x0B,0xA0); /* ISR register */
264 value = inb(0xA0) & (irqmask >> 8);
265 outb(0x0A,0xA0); /* back to the IRR register */
266 return value;
267}
268
269/*
270 * Careful! The 8259A is a fragile beast, it pretty
271 * much _has_ to be done exactly like this (mask it
272 * first, _then_ send the EOI, and the order of EOI
273 * to the two 8259s is important!
274 */
275static void mask_and_ack_8259A(unsigned int irq)
276{
277 unsigned int irqmask = 1 << irq;
278 unsigned long flags;
279
280 spin_lock_irqsave(&i8259A_lock, flags);
281 /*
282 * Lightweight spurious IRQ detection. We do not want
283 * to overdo spurious IRQ handling - it's usually a sign
284 * of hardware problems, so we only do the checks we can
 285 * do without slowing down good hardware unnecessarily.
286 *
287 * Note that IRQ7 and IRQ15 (the two spurious IRQs
288 * usually resulting from the 8259A-1|2 PICs) occur
289 * even if the IRQ is masked in the 8259A. Thus we
290 * can check spurious 8259A IRQs without doing the
291 * quite slow i8259A_irq_real() call for every IRQ.
292 * This does not cover 100% of spurious interrupts,
293 * but should be enough to warn the user that there
294 * is something bad going on ...
295 */
296 if (cached_irq_mask & irqmask)
297 goto spurious_8259A_irq;
298 cached_irq_mask |= irqmask;
299
300handle_real_irq:
301 if (irq & 8) {
302 inb(0xA1); /* DUMMY - (do we need this?) */
303 outb(cached_A1,0xA1);
304 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
305 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
306 } else {
307 inb(0x21); /* DUMMY - (do we need this?) */
308 outb(cached_21,0x21);
309 outb(0x60+irq,0x20); /* 'Specific EOI' to master */
310 }
311 spin_unlock_irqrestore(&i8259A_lock, flags);
312 return;
313
314spurious_8259A_irq:
315 /*
316 * this is the slow path - should happen rarely.
317 */
318 if (i8259A_irq_real(irq))
319 /*
320 * oops, the IRQ _is_ in service according to the
321 * 8259A - not spurious, go handle it.
322 */
323 goto handle_real_irq;
324
325 {
326 static int spurious_irq_mask;
327 /*
328 * At this point we can be sure the IRQ is spurious,
 329 * let's ACK and report it. [once per IRQ]
330 */
331 if (!(spurious_irq_mask & irqmask)) {
332 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
333 spurious_irq_mask |= irqmask;
334 }
335 atomic_inc(&irq_err_count);
336 /*
337 * Theoretically we do not have to handle this IRQ,
338 * but in Linux this does not cause problems and is
339 * simpler for us.
340 */
341 goto handle_real_irq;
342 }
343}
344
345void init_8259A(int auto_eoi)
346{
347 unsigned long flags;
348
349 spin_lock_irqsave(&i8259A_lock, flags);
350
351 outb(0xff, 0x21); /* mask all of 8259A-1 */
352 outb(0xff, 0xA1); /* mask all of 8259A-2 */
353
354 /*
355 * outb_p - this has to work on a wide range of PC hardware.
356 */
357 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
358 outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
359 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
360 if (auto_eoi)
361 outb_p(0x03, 0x21); /* master does Auto EOI */
362 else
363 outb_p(0x01, 0x21); /* master expects normal EOI */
364
365 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
366 outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
367 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
368 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
369 is to be investigated) */
370
371 if (auto_eoi)
372 /*
373 * in AEOI mode we just have to mask the interrupt
374 * when acking.
375 */
376 i8259A_irq_type.ack = disable_8259A_irq;
377 else
378 i8259A_irq_type.ack = mask_and_ack_8259A;
379
380 udelay(100); /* wait for 8259A to initialize */
381
382 outb(cached_21, 0x21); /* restore master IRQ mask */
383 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
384
385 spin_unlock_irqrestore(&i8259A_lock, flags);
386}
387
388static char irq_trigger[2];
389/**
390 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
391 */
392static void restore_ELCR(char *trigger)
393{
394 outb(trigger[0], 0x4d0);
395 outb(trigger[1], 0x4d1);
396}
397
398static void save_ELCR(char *trigger)
399{
400 /* IRQ 0,1,2,8,13 are marked as reserved */
401 trigger[0] = inb(0x4d0) & 0xF8;
402 trigger[1] = inb(0x4d1) & 0xDE;
403}
404
405static int i8259A_resume(struct sys_device *dev)
406{
407 init_8259A(0);
408 restore_ELCR(irq_trigger);
409 return 0;
410}
411
412static int i8259A_suspend(struct sys_device *dev, u32 state)
413{
414 save_ELCR(irq_trigger);
415 return 0;
416}
417
418static struct sysdev_class i8259_sysdev_class = {
419 set_kset_name("i8259"),
420 .suspend = i8259A_suspend,
421 .resume = i8259A_resume,
422};
423
424static struct sys_device device_i8259A = {
425 .id = 0,
426 .cls = &i8259_sysdev_class,
427};
428
429static int __init i8259A_init_sysfs(void)
430{
431 int error = sysdev_class_register(&i8259_sysdev_class);
432 if (!error)
433 error = sysdev_register(&device_i8259A);
434 return error;
435}
436
437device_initcall(i8259A_init_sysfs);
438
439/*
440 * IRQ2 is cascade interrupt to second interrupt controller
441 */
442
443static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
444
445void __init init_ISA_irqs (void)
446{
447 int i;
448
449#ifdef CONFIG_X86_LOCAL_APIC
450 init_bsp_APIC();
451#endif
452 init_8259A(0);
453
454 for (i = 0; i < NR_IRQS; i++) {
455 irq_desc[i].status = IRQ_DISABLED;
456 irq_desc[i].action = NULL;
457 irq_desc[i].depth = 1;
458
459 if (i < 16) {
460 /*
461 * 16 old-style INTA-cycle interrupts:
462 */
463 irq_desc[i].handler = &i8259A_irq_type;
464 } else {
465 /*
466 * 'high' PCI IRQs filled in on demand
467 */
468 irq_desc[i].handler = &no_irq_type;
469 }
470 }
471}
472
473void apic_timer_interrupt(void);
474void spurious_interrupt(void);
475void error_interrupt(void);
476void reschedule_interrupt(void);
477void call_function_interrupt(void);
478void invalidate_interrupt(void);
479void thermal_interrupt(void);
480void i8254_timer_resume(void);
481
482static void setup_timer(void)
483{
484 outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
485 udelay(10);
486 outb_p(LATCH & 0xff , 0x40); /* LSB */
487 udelay(10);
488 outb(LATCH >> 8 , 0x40); /* MSB */
489}
490
491static int timer_resume(struct sys_device *dev)
492{
493 setup_timer();
494 return 0;
495}
496
497void i8254_timer_resume(void)
498{
499 setup_timer();
500}
501
502static struct sysdev_class timer_sysclass = {
503 set_kset_name("timer"),
504 .resume = timer_resume,
505};
506
507static struct sys_device device_timer = {
508 .id = 0,
509 .cls = &timer_sysclass,
510};
511
512static int __init init_timer_sysfs(void)
513{
514 int error = sysdev_class_register(&timer_sysclass);
515 if (!error)
516 error = sysdev_register(&device_timer);
517 return error;
518}
519
520device_initcall(init_timer_sysfs);
521
522void __init init_IRQ(void)
523{
524 int i;
525
526 init_ISA_irqs();
527 /*
 528 * Cover the whole vector space; no vector can escape
529 * us. (some of these will be overridden and become
530 * 'special' SMP interrupts)
531 */
532 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
533 int vector = FIRST_EXTERNAL_VECTOR + i;
534 if (i >= NR_IRQS)
535 break;
536 if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) {
537 set_intr_gate(vector, interrupt[i]);
538 }
539 }
540
541#ifdef CONFIG_SMP
542 /*
543 * IRQ0 must be given a fixed assignment and initialized,
544 * because it's used before the IO-APIC is set up.
545 */
546 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
547
548 /*
549 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
550 * IPI, driven by wakeup.
551 */
552 set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
553
554 /* IPI for invalidation */
555 set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
556
557 /* IPI for generic function call */
558 set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
559#endif
560 set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
561
562#ifdef CONFIG_X86_LOCAL_APIC
563 /* self generated IPI for local APIC timer */
564 set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
565
566 /* IPI vectors for APIC spurious and error interrupts */
567 set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
568 set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
569#endif
570
571 /*
572 * Set the clock to HZ Hz, we already have a valid
573 * vector now:
574 */
575 setup_timer();
576
577 if (!acpi_ioapic)
578 setup_irq(2, &irq2);
579}
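
For illustration, a userspace sketch of how the cached_irq_mask byte aliasing
above works: cached_21 and cached_A1 are simply the low and high byte of the
16-bit mask (little-endian, as on x86), and disable_8259A_irq() writes whichever
byte belongs to the PIC that owns the line (the slave handles IRQ 8-15). The
printf calls stand in for the real outb() port writes.

/* Hypothetical userspace sketch; not kernel code, no real port I/O. */
#include <stdio.h>

#define __byte(n, x) (((unsigned char *)&(x))[n])

static unsigned int cached_irq_mask = 0xffff;
#define cached_21 (__byte(0, cached_irq_mask))   /* master PIC data port 0x21 */
#define cached_A1 (__byte(1, cached_irq_mask))   /* slave PIC data port 0xA1 */

static void disable_line(unsigned int irq)
{
	cached_irq_mask |= 1u << irq;
	if (irq & 8)
		printf("outb(0x%02x, 0xA1)\n", cached_A1);
	else
		printf("outb(0x%02x, 0x21)\n", cached_21);
}

int main(void)
{
	cached_irq_mask = 0;   /* pretend all 16 lines start enabled */
	disable_line(3);       /* master: outb(0x08, 0x21) */
	disable_line(12);      /* slave:  outb(0x10, 0xA1) */
	return 0;
}
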
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
new file mode 100644
index 000000000000..c4dc91491157
--- /dev/null
+++ b/arch/x86_64/kernel/init_task.c
@@ -0,0 +1,49 @@
1#include <linux/mm.h>
2#include <linux/module.h>
3#include <linux/sched.h>
4#include <linux/init.h>
5#include <linux/init_task.h>
6#include <linux/fs.h>
7#include <linux/mqueue.h>
8
9#include <asm/uaccess.h>
10#include <asm/pgtable.h>
11#include <asm/desc.h>
12
13static struct fs_struct init_fs = INIT_FS;
14static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm);
18
19EXPORT_SYMBOL(init_mm);
20
21/*
22 * Initial task structure.
23 *
24 * We need to make sure that this is 8192-byte aligned due to the
25 * way process stacks are handled. This is done by having a special
26 * "init_task" linker map entry..
27 */
28union thread_union init_thread_union
29 __attribute__((__section__(".data.init_task"))) =
30 { INIT_THREAD_INFO(init_task) };
31
32/*
33 * Initial task structure.
34 *
35 * All other task structs will be allocated on slabs in fork.c
36 */
37struct task_struct init_task = INIT_TASK(init_task);
38
39EXPORT_SYMBOL(init_task);
40/*
41 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
42 * no more per-task TSS's. The TSS size is kept cacheline-aligned
43 * so they are allowed to end up in the .data.cacheline_aligned
44 * section. Since TSS's are completely CPU-local, we want them
45 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
46 */
47DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
48
49#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
new file mode 100644
index 000000000000..adebe70c180e
--- /dev/null
+++ b/arch/x86_64/kernel/io_apic.c
@@ -0,0 +1,1982 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/irq.h>
25#include <linux/interrupt.h>
26#include <linux/init.h>
27#include <linux/delay.h>
28#include <linux/sched.h>
29#include <linux/config.h>
30#include <linux/smp_lock.h>
31#include <linux/mc146818rtc.h>
32#include <linux/acpi.h>
33#include <linux/sysdev.h>
34
35#include <asm/io.h>
36#include <asm/smp.h>
37#include <asm/desc.h>
38#include <asm/proto.h>
39#include <asm/mach_apic.h>
40
41#define __apicdebuginit __init
42
43int sis_apic_bug; /* not actually supported, dummy for compile */
44
45static DEFINE_SPINLOCK(ioapic_lock);
46
47/*
48 * # of IRQ routing registers
49 */
50int nr_ioapic_registers[MAX_IO_APICS];
51
52/*
53 * Rough estimation of how many shared IRQs there are, can
54 * be changed anytime.
55 */
56#define MAX_PLUS_SHARED_IRQS NR_IRQS
57#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
58
59/*
60 * This is performance-critical, we want to do it O(1)
61 *
62 * the indexing order of this array favors 1:1 mappings
63 * between pins and IRQs.
64 */
65
66static struct irq_pin_list {
67 short apic, pin, next;
68} irq_2_pin[PIN_MAP_SIZE];
69
70int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
71#ifdef CONFIG_PCI_MSI
72#define vector_to_irq(vector) \
73 (platform_legacy_irq(vector) ? vector : vector_irq[vector])
74#else
75#define vector_to_irq(vector) (vector)
76#endif
77
78/*
79 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
80 * shared ISA-space IRQs, so we have to support them. We are super
81 * fast in the common case, and fast for shared ISA-space IRQs.
82 */
83static void add_pin_to_irq(unsigned int irq, int apic, int pin)
84{
85 static int first_free_entry = NR_IRQS;
86 struct irq_pin_list *entry = irq_2_pin + irq;
87
88 while (entry->next)
89 entry = irq_2_pin + entry->next;
90
91 if (entry->pin != -1) {
92 entry->next = first_free_entry;
93 entry = irq_2_pin + entry->next;
94 if (++first_free_entry >= PIN_MAP_SIZE)
95 panic("io_apic.c: whoops");
96 }
97 entry->apic = apic;
98 entry->pin = pin;
99}
100
101#define __DO_ACTION(R, ACTION, FINAL) \
102 \
103{ \
104 int pin; \
105 struct irq_pin_list *entry = irq_2_pin + irq; \
106 \
107 for (;;) { \
108 unsigned int reg; \
109 pin = entry->pin; \
110 if (pin == -1) \
111 break; \
112 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
113 reg ACTION; \
114 io_apic_modify(entry->apic, reg); \
115 if (!entry->next) \
116 break; \
117 entry = irq_2_pin + entry->next; \
118 } \
119 FINAL; \
120}
121
122#define DO_ACTION(name,R,ACTION, FINAL) \
123 \
124 static void name##_IO_APIC_irq (unsigned int irq) \
125 __DO_ACTION(R, ACTION, FINAL)
126
127DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
128 /* mask = 1 */
129DO_ACTION( __unmask, 0, &= 0xfffeffff, )
130 /* mask = 0 */
131
132static void mask_IO_APIC_irq (unsigned int irq)
133{
134 unsigned long flags;
135
136 spin_lock_irqsave(&ioapic_lock, flags);
137 __mask_IO_APIC_irq(irq);
138 spin_unlock_irqrestore(&ioapic_lock, flags);
139}
140
141static void unmask_IO_APIC_irq (unsigned int irq)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&ioapic_lock, flags);
146 __unmask_IO_APIC_irq(irq);
147 spin_unlock_irqrestore(&ioapic_lock, flags);
148}
149
150static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
151{
152 struct IO_APIC_route_entry entry;
153 unsigned long flags;
154
155 /* Check delivery_mode to be sure we're not clearing an SMI pin */
156 spin_lock_irqsave(&ioapic_lock, flags);
157 *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
158 *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
159 spin_unlock_irqrestore(&ioapic_lock, flags);
160 if (entry.delivery_mode == dest_SMI)
161 return;
162 /*
163 * Disable it in the IO-APIC irq-routing table:
164 */
165 memset(&entry, 0, sizeof(entry));
166 entry.mask = 1;
167 spin_lock_irqsave(&ioapic_lock, flags);
168 io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
169 io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
170 spin_unlock_irqrestore(&ioapic_lock, flags);
171}
172
173static void clear_IO_APIC (void)
174{
175 int apic, pin;
176
177 for (apic = 0; apic < nr_ioapics; apic++)
178 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
179 clear_IO_APIC_pin(apic, pin);
180}
181
182/*
183 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
184 * specific CPU-side IRQs.
185 */
186
187#define MAX_PIRQS 8
188static int pirq_entries [MAX_PIRQS];
189static int pirqs_enabled;
190int skip_ioapic_setup;
191int ioapic_force;
192
193/* dummy parsing: see setup.c */
194
195static int __init disable_ioapic_setup(char *str)
196{
197 skip_ioapic_setup = 1;
198 return 1;
199}
200
201static int __init enable_ioapic_setup(char *str)
202{
203 ioapic_force = 1;
204 skip_ioapic_setup = 0;
205 return 1;
206}
207
208__setup("noapic", disable_ioapic_setup);
209__setup("apic", enable_ioapic_setup);
210
211#include <asm/pci-direct.h>
212#include <linux/pci_ids.h>
213#include <linux/pci.h>
214
215/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
 216 off. Check for an Nvidia or VIA PCI bridge and turn the IO-APIC off.
217 Use pci direct infrastructure because this runs before the PCI subsystem.
218
 219 Can be overridden with "apic".
220
221 And another hack to disable the IOMMU on VIA chipsets.
222
223 Kludge-O-Rama. */
224void __init check_ioapic(void)
225{
226 int num,slot,func;
227 if (ioapic_force)
228 return;
229
230 /* Poor man's PCI discovery */
231 for (num = 0; num < 32; num++) {
232 for (slot = 0; slot < 32; slot++) {
233 for (func = 0; func < 8; func++) {
234 u32 class;
235 u32 vendor;
236 u8 type;
237 class = read_pci_config(num,slot,func,
238 PCI_CLASS_REVISION);
239 if (class == 0xffffffff)
240 break;
241
242 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
243 continue;
244
245 vendor = read_pci_config(num, slot, func,
246 PCI_VENDOR_ID);
247 vendor &= 0xffff;
248 switch (vendor) {
249 case PCI_VENDOR_ID_VIA:
250#ifdef CONFIG_GART_IOMMU
251 if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
252 force_iommu) &&
253 !iommu_aperture_allowed) {
254 printk(KERN_INFO
255 "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
256 iommu_aperture_disabled = 1;
257 }
258#endif
259 return;
260 case PCI_VENDOR_ID_NVIDIA:
261#ifdef CONFIG_ACPI
262 /* All timer overrides on Nvidia
263 seem to be wrong. Skip them. */
264 acpi_skip_timer_override = 1;
265 printk(KERN_INFO
266 "Nvidia board detected. Ignoring ACPI timer override.\n");
267#endif
268 /* RED-PEN skip them on mptables too? */
269 return;
270 }
271
272 /* No multi-function device? */
273 type = read_pci_config_byte(num,slot,func,
274 PCI_HEADER_TYPE);
275 if (!(type & 0x80))
276 break;
277 }
278 }
279 }
280}
281
282static int __init ioapic_pirq_setup(char *str)
283{
284 int i, max;
285 int ints[MAX_PIRQS+1];
286
287 get_options(str, ARRAY_SIZE(ints), ints);
288
289 for (i = 0; i < MAX_PIRQS; i++)
290 pirq_entries[i] = -1;
291
292 pirqs_enabled = 1;
293 apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
294 max = MAX_PIRQS;
295 if (ints[0] < MAX_PIRQS)
296 max = ints[0];
297
298 for (i = 0; i < max; i++) {
299 apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
300 /*
301 * PIRQs are mapped upside down, usually.
302 */
303 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
304 }
305 return 1;
306}
307
308__setup("pirq=", ioapic_pirq_setup);
309
310/*
311 * Find the IRQ entry number of a certain pin.
312 */
313static int find_irq_entry(int apic, int pin, int type)
314{
315 int i;
316
317 for (i = 0; i < mp_irq_entries; i++)
318 if (mp_irqs[i].mpc_irqtype == type &&
319 (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
320 mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
321 mp_irqs[i].mpc_dstirq == pin)
322 return i;
323
324 return -1;
325}
326
327/*
328 * Find the pin to which IRQ[irq] (ISA) is connected
329 */
330static int __init find_isa_irq_pin(int irq, int type)
331{
332 int i;
333
334 for (i = 0; i < mp_irq_entries; i++) {
335 int lbus = mp_irqs[i].mpc_srcbus;
336
337 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
338 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
339 mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
340 (mp_irqs[i].mpc_irqtype == type) &&
341 (mp_irqs[i].mpc_srcbusirq == irq))
342
343 return mp_irqs[i].mpc_dstirq;
344 }
345 return -1;
346}
347
348/*
349 * Find a specific PCI IRQ entry.
350 * Not an __init, possibly needed by modules
351 */
352static int pin_2_irq(int idx, int apic, int pin);
353
354int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
355{
356 int apic, i, best_guess = -1;
357
358 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
359 bus, slot, pin);
360 if (mp_bus_id_to_pci_bus[bus] == -1) {
361 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
362 return -1;
363 }
364 for (i = 0; i < mp_irq_entries; i++) {
365 int lbus = mp_irqs[i].mpc_srcbus;
366
367 for (apic = 0; apic < nr_ioapics; apic++)
368 if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
369 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
370 break;
371
372 if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
373 !mp_irqs[i].mpc_irqtype &&
374 (bus == lbus) &&
375 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
376 int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
377
378 if (!(apic || IO_APIC_IRQ(irq)))
379 continue;
380
381 if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
382 return irq;
383 /*
384 * Use the first all-but-pin matching entry as a
385 * best-guess fuzzy result for broken mptables.
386 */
387 if (best_guess < 0)
388 best_guess = irq;
389 }
390 }
391 return best_guess;
392}
393
394/*
395 * EISA Edge/Level control register, ELCR
396 */
397static int EISA_ELCR(unsigned int irq)
398{
399 if (irq < 16) {
400 unsigned int port = 0x4d0 + (irq >> 3);
401 return (inb(port) >> (irq & 7)) & 1;
402 }
403 apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
404 return 0;
405}
406
407/* EISA interrupts are always polarity zero and can be edge or level
408 * trigger depending on the ELCR value. If an interrupt is listed as
409 * EISA conforming in the MP table, that means its trigger type must
410 * be read in from the ELCR */
411
412#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
413#define default_EISA_polarity(idx) (0)
414
415/* ISA interrupts are always polarity zero edge triggered,
416 * when listed as conforming in the MP table. */
417
418#define default_ISA_trigger(idx) (0)
419#define default_ISA_polarity(idx) (0)
420
421/* PCI interrupts are always polarity one level triggered,
422 * when listed as conforming in the MP table. */
423
424#define default_PCI_trigger(idx) (1)
425#define default_PCI_polarity(idx) (1)
426
427/* MCA interrupts are always polarity zero level triggered,
428 * when listed as conforming in the MP table. */
429
430#define default_MCA_trigger(idx) (1)
431#define default_MCA_polarity(idx) (0)
432
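For illustration, a userspace sketch of the ELCR lookup that EISA_ELCR() above
performs: port 0x4d0 holds the trigger bits for IRQ 0-7 and port 0x4d1 for
IRQ 8-15, with a set bit meaning level trigger. The ELCR contents in main() are
made up for the example; the real function reads the ports with inb().

/* Hypothetical userspace sketch; not kernel code, no real port I/O. */
#include <stdio.h>

static void show_elcr_lookup(unsigned int irq, unsigned char elcr0, unsigned char elcr1)
{
	unsigned int port  = 0x4d0 + (irq >> 3);
	unsigned char byte = (irq >> 3) ? elcr1 : elcr0;
	int level = (byte >> (irq & 7)) & 1;

	printf("IRQ %u -> port 0x%x bit %u: %s\n",
	       irq, port, irq & 7, level ? "level" : "edge");
}

int main(void)
{
	unsigned char elcr0 = 1 << 5;   /* pretend IRQ 5 is level triggered */
	unsigned char elcr1 = 1 << 2;   /* pretend IRQ 10 is level triggered */

	show_elcr_lookup(5, elcr0, elcr1);    /* level */
	show_elcr_lookup(10, elcr0, elcr1);   /* level */
	show_elcr_lookup(3, elcr0, elcr1);    /* edge */
	return 0;
}
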
433static int __init MPBIOS_polarity(int idx)
434{
435 int bus = mp_irqs[idx].mpc_srcbus;
436 int polarity;
437
438 /*
439 * Determine IRQ line polarity (high active or low active):
440 */
441 switch (mp_irqs[idx].mpc_irqflag & 3)
442 {
443 case 0: /* conforms, ie. bus-type dependent polarity */
444 {
445 switch (mp_bus_id_to_type[bus])
446 {
447 case MP_BUS_ISA: /* ISA pin */
448 {
449 polarity = default_ISA_polarity(idx);
450 break;
451 }
452 case MP_BUS_EISA: /* EISA pin */
453 {
454 polarity = default_EISA_polarity(idx);
455 break;
456 }
457 case MP_BUS_PCI: /* PCI pin */
458 {
459 polarity = default_PCI_polarity(idx);
460 break;
461 }
462 case MP_BUS_MCA: /* MCA pin */
463 {
464 polarity = default_MCA_polarity(idx);
465 break;
466 }
467 default:
468 {
469 printk(KERN_WARNING "broken BIOS!!\n");
470 polarity = 1;
471 break;
472 }
473 }
474 break;
475 }
476 case 1: /* high active */
477 {
478 polarity = 0;
479 break;
480 }
481 case 2: /* reserved */
482 {
483 printk(KERN_WARNING "broken BIOS!!\n");
484 polarity = 1;
485 break;
486 }
487 case 3: /* low active */
488 {
489 polarity = 1;
490 break;
491 }
492 default: /* invalid */
493 {
494 printk(KERN_WARNING "broken BIOS!!\n");
495 polarity = 1;
496 break;
497 }
498 }
499 return polarity;
500}
501
502static int MPBIOS_trigger(int idx)
503{
504 int bus = mp_irqs[idx].mpc_srcbus;
505 int trigger;
506
507 /*
508 * Determine IRQ trigger mode (edge or level sensitive):
509 */
510 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
511 {
512 case 0: /* conforms, ie. bus-type dependent */
513 {
514 switch (mp_bus_id_to_type[bus])
515 {
516 case MP_BUS_ISA: /* ISA pin */
517 {
518 trigger = default_ISA_trigger(idx);
519 break;
520 }
521 case MP_BUS_EISA: /* EISA pin */
522 {
523 trigger = default_EISA_trigger(idx);
524 break;
525 }
526 case MP_BUS_PCI: /* PCI pin */
527 {
528 trigger = default_PCI_trigger(idx);
529 break;
530 }
531 case MP_BUS_MCA: /* MCA pin */
532 {
533 trigger = default_MCA_trigger(idx);
534 break;
535 }
536 default:
537 {
538 printk(KERN_WARNING "broken BIOS!!\n");
539 trigger = 1;
540 break;
541 }
542 }
543 break;
544 }
545 case 1: /* edge */
546 {
547 trigger = 0;
548 break;
549 }
550 case 2: /* reserved */
551 {
552 printk(KERN_WARNING "broken BIOS!!\n");
553 trigger = 1;
554 break;
555 }
556 case 3: /* level */
557 {
558 trigger = 1;
559 break;
560 }
561 default: /* invalid */
562 {
563 printk(KERN_WARNING "broken BIOS!!\n");
564 trigger = 0;
565 break;
566 }
567 }
568 return trigger;
569}
570
571static inline int irq_polarity(int idx)
572{
573 return MPBIOS_polarity(idx);
574}
575
576static inline int irq_trigger(int idx)
577{
578 return MPBIOS_trigger(idx);
579}
580
581static int pin_2_irq(int idx, int apic, int pin)
582{
583 int irq, i;
584 int bus = mp_irqs[idx].mpc_srcbus;
585
586 /*
587 * Debugging check, we are in big trouble if this message pops up!
588 */
589 if (mp_irqs[idx].mpc_dstirq != pin)
590 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
591
592 switch (mp_bus_id_to_type[bus])
593 {
594 case MP_BUS_ISA: /* ISA pin */
595 case MP_BUS_EISA:
596 case MP_BUS_MCA:
597 {
598 irq = mp_irqs[idx].mpc_srcbusirq;
599 break;
600 }
601 case MP_BUS_PCI: /* PCI pin */
602 {
603 /*
604 * PCI IRQs are mapped in order
605 */
606 i = irq = 0;
607 while (i < apic)
608 irq += nr_ioapic_registers[i++];
609 irq += pin;
610 break;
611 }
612 default:
613 {
614 printk(KERN_ERR "unknown bus type %d.\n",bus);
615 irq = 0;
616 break;
617 }
618 }
619
620 /*
621 * PCI IRQ command line redirection. Yes, limits are hardcoded.
622 */
623 if ((pin >= 16) && (pin <= 23)) {
624 if (pirq_entries[pin-16] != -1) {
625 if (!pirq_entries[pin-16]) {
626 apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
627 } else {
628 irq = pirq_entries[pin-16];
629 apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
630 pin-16, irq);
631 }
632 }
633 }
634 return irq;
635}
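/*
 * Illustrative example of the PCI mapping above: with two IO-APICs of
 * 24 pins each, a PCI interrupt on apic 1, pin 3 becomes IRQ 24 + 3 = 27,
 * while ISA/EISA/MCA pins simply keep their mpc_srcbusirq number.
 */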
636
637static inline int IO_APIC_irq_trigger(int irq)
638{
639 int apic, idx, pin;
640
641 for (apic = 0; apic < nr_ioapics; apic++) {
642 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
643 idx = find_irq_entry(apic,pin,mp_INT);
644 if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
645 return irq_trigger(idx);
646 }
647 }
648 /*
649 * nonexistent IRQs are edge default
650 */
651 return 0;
652}
653
654/* irq_vector is indexed by the sum of all RTEs in all I/O APICs. */
655u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
656
657int assign_irq_vector(int irq)
658{
659 static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
660
661 BUG_ON(irq >= NR_IRQ_VECTORS);
662 if (IO_APIC_VECTOR(irq) > 0)
663 return IO_APIC_VECTOR(irq);
664next:
665 current_vector += 8;
666 if (current_vector == IA32_SYSCALL_VECTOR)
667 goto next;
668
669 if (current_vector >= FIRST_SYSTEM_VECTOR) {
670 offset++;
671 if (!(offset%8))
672 return -ENOSPC;
673 current_vector = FIRST_DEVICE_VECTOR + offset;
674 }
675
676 vector_irq[current_vector] = irq;
677 if (irq != AUTO_ASSIGN)
678 IO_APIC_VECTOR(irq) = current_vector;
679
680 return current_vector;
681}
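/*
 * Illustrative walk-through of the allocator above: vectors are handed out
 * in steps of 8 so that consecutive IRQs land in different local-APIC
 * priority classes (priority class = vector >> 4); the IA32 syscall vector
 * is skipped, and when FIRST_SYSTEM_VECTOR is reached the scan wraps to
 * FIRST_DEVICE_VECTOR + offset, giving up with -ENOSPC after eight passes.
 */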
682
683extern void (*interrupt[NR_IRQS])(void);
684static struct hw_interrupt_type ioapic_level_type;
685static struct hw_interrupt_type ioapic_edge_type;
686
687#define IOAPIC_AUTO -1
688#define IOAPIC_EDGE 0
689#define IOAPIC_LEVEL 1
690
691static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
692{
693 if (use_pci_vector() && !platform_legacy_irq(irq)) {
694 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
695 trigger == IOAPIC_LEVEL)
696 irq_desc[vector].handler = &ioapic_level_type;
697 else
698 irq_desc[vector].handler = &ioapic_edge_type;
699 set_intr_gate(vector, interrupt[vector]);
700 } else {
701 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
702 trigger == IOAPIC_LEVEL)
703 irq_desc[irq].handler = &ioapic_level_type;
704 else
705 irq_desc[irq].handler = &ioapic_edge_type;
706 set_intr_gate(vector, interrupt[irq]);
707 }
708}
709
710static void __init setup_IO_APIC_irqs(void)
711{
712 struct IO_APIC_route_entry entry;
713 int apic, pin, idx, irq, first_notcon = 1, vector;
714 unsigned long flags;
715
716 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
717
718 for (apic = 0; apic < nr_ioapics; apic++) {
719 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
720
721 /*
722 * add it to the IO-APIC irq-routing table:
723 */
724 memset(&entry,0,sizeof(entry));
725
726 entry.delivery_mode = INT_DELIVERY_MODE;
727 entry.dest_mode = INT_DEST_MODE;
728 entry.mask = 0; /* enable IRQ */
729 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
730
731 idx = find_irq_entry(apic,pin,mp_INT);
732 if (idx == -1) {
733 if (first_notcon) {
734 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
735 first_notcon = 0;
736 } else
737 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
738 continue;
739 }
740
741 entry.trigger = irq_trigger(idx);
742 entry.polarity = irq_polarity(idx);
743
744 if (irq_trigger(idx)) {
745 entry.trigger = 1;
746 entry.mask = 1;
747 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
748 }
749
750 irq = pin_2_irq(idx, apic, pin);
751 add_pin_to_irq(irq, apic, pin);
752
753 if (!apic && !IO_APIC_IRQ(irq))
754 continue;
755
756 if (IO_APIC_IRQ(irq)) {
757 vector = assign_irq_vector(irq);
758 entry.vector = vector;
759
760 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
761 if (!apic && (irq < 16))
762 disable_8259A_irq(irq);
763 }
764 spin_lock_irqsave(&ioapic_lock, flags);
765 io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
766 io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
767 spin_unlock_irqrestore(&ioapic_lock, flags);
768 }
769 }
770
771 if (!first_notcon)
772 apic_printk(APIC_VERBOSE," not connected.\n");
773}
774
775/*
776 * Set up the 8259A-master output pin as broadcast to all
777 * CPUs.
778 */
779static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
780{
781 struct IO_APIC_route_entry entry;
782 unsigned long flags;
783
784 memset(&entry,0,sizeof(entry));
785
786 disable_8259A_irq(0);
787
788 /* mask LVT0 */
789 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
790
791 /*
792 * We use logical delivery to get the timer IRQ
793 * to the first CPU.
794 */
795 entry.dest_mode = INT_DEST_MODE;
796 entry.mask = 0; /* unmask IRQ now */
797 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
798 entry.delivery_mode = INT_DELIVERY_MODE;
799 entry.polarity = 0;
800 entry.trigger = 0;
801 entry.vector = vector;
802
803 /*
804 * The timer IRQ doesn't have to know that behind the
805 * scenes we have an 8259A-master in AEOI mode ...
806 */
807 irq_desc[0].handler = &ioapic_edge_type;
808
809 /*
810 * Add it to the IO-APIC irq-routing table:
811 */
812 spin_lock_irqsave(&ioapic_lock, flags);
813 io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
814 io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
815 spin_unlock_irqrestore(&ioapic_lock, flags);
816
817 enable_8259A_irq(0);
818}
819
820void __init UNEXPECTED_IO_APIC(void)
821{
822}
823
824void __apicdebuginit print_IO_APIC(void)
825{
826 int apic, i;
827 union IO_APIC_reg_00 reg_00;
828 union IO_APIC_reg_01 reg_01;
829 union IO_APIC_reg_02 reg_02;
830 unsigned long flags;
831
832 if (apic_verbosity == APIC_QUIET)
833 return;
834
835 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
836 for (i = 0; i < nr_ioapics; i++)
837 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
838 mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
839
840 /*
841 * We are a bit conservative about what we expect. We have to
842 * know about every hardware change ASAP.
843 */
844 printk(KERN_INFO "testing the IO APIC.......................\n");
845
846 for (apic = 0; apic < nr_ioapics; apic++) {
847
848 spin_lock_irqsave(&ioapic_lock, flags);
849 reg_00.raw = io_apic_read(apic, 0);
850 reg_01.raw = io_apic_read(apic, 1);
851 if (reg_01.bits.version >= 0x10)
852 reg_02.raw = io_apic_read(apic, 2);
853 spin_unlock_irqrestore(&ioapic_lock, flags);
854
855 printk("\n");
856 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
857 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
858 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
859 if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
860 UNEXPECTED_IO_APIC();
861
862 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
863 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
864 if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
865 (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
866 (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
867 (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
868 (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
869 (reg_01.bits.entries != 0x2E) &&
870 (reg_01.bits.entries != 0x3F) &&
871 (reg_01.bits.entries != 0x03)
872 )
873 UNEXPECTED_IO_APIC();
874
875 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
876 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
877 if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
878 (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
879 (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
880 (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
881 (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
882 (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
883 )
884 UNEXPECTED_IO_APIC();
885 if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
886 UNEXPECTED_IO_APIC();
887
888 if (reg_01.bits.version >= 0x10) {
889 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
890 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
891 if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
892 UNEXPECTED_IO_APIC();
893 }
894
895 printk(KERN_DEBUG ".... IRQ redirection table:\n");
896
897 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
898 " Stat Dest Deli Vect: \n");
899
900 for (i = 0; i <= reg_01.bits.entries; i++) {
901 struct IO_APIC_route_entry entry;
902
903 spin_lock_irqsave(&ioapic_lock, flags);
904 *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
905 *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
906 spin_unlock_irqrestore(&ioapic_lock, flags);
907
908 printk(KERN_DEBUG " %02x %03X %02X ",
909 i,
910 entry.dest.logical.logical_dest,
911 entry.dest.physical.physical_dest
912 );
913
914 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
915 entry.mask,
916 entry.trigger,
917 entry.irr,
918 entry.polarity,
919 entry.delivery_status,
920 entry.dest_mode,
921 entry.delivery_mode,
922 entry.vector
923 );
924 }
925 }
926 if (use_pci_vector())
927 printk(KERN_INFO "Using vector-based indexing\n");
928 printk(KERN_DEBUG "IRQ to pin mappings:\n");
929 for (i = 0; i < NR_IRQS; i++) {
930 struct irq_pin_list *entry = irq_2_pin + i;
931 if (entry->pin < 0)
932 continue;
933 if (use_pci_vector() && !platform_legacy_irq(i))
934 printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
935 else
936 printk(KERN_DEBUG "IRQ%d ", i);
937 for (;;) {
938 printk("-> %d:%d", entry->apic, entry->pin);
939 if (!entry->next)
940 break;
941 entry = irq_2_pin + entry->next;
942 }
943 printk("\n");
944 }
945
946 printk(KERN_INFO ".................................... done.\n");
947
948 return;
949}
950
951#if 0
952
953static __apicdebuginit void print_APIC_bitfield (int base)
954{
955 unsigned int v;
956 int i, j;
957
958 if (apic_verbosity == APIC_QUIET)
959 return;
960
961 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
962 for (i = 0; i < 8; i++) {
963 v = apic_read(base + i*0x10);
964 for (j = 0; j < 32; j++) {
965 if (v & (1<<j))
966 printk("1");
967 else
968 printk("0");
969 }
970 printk("\n");
971 }
972}
973
974void __apicdebuginit print_local_APIC(void * dummy)
975{
976 unsigned int v, ver, maxlvt;
977
978 if (apic_verbosity == APIC_QUIET)
979 return;
980
981 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
982 smp_processor_id(), hard_smp_processor_id());
983 v = apic_read(APIC_ID);
984 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
985 v = apic_read(APIC_LVR);
986 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
987 ver = GET_APIC_VERSION(v);
988 maxlvt = get_maxlvt();
989
990 v = apic_read(APIC_TASKPRI);
991 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
992
993 if (APIC_INTEGRATED(ver)) { /* !82489DX */
994 v = apic_read(APIC_ARBPRI);
995 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
996 v & APIC_ARBPRI_MASK);
997 v = apic_read(APIC_PROCPRI);
998 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
999 }
1000
1001 v = apic_read(APIC_EOI);
1002 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1003 v = apic_read(APIC_RRR);
1004 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1005 v = apic_read(APIC_LDR);
1006 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1007 v = apic_read(APIC_DFR);
1008 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1009 v = apic_read(APIC_SPIV);
1010 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1011
1012 printk(KERN_DEBUG "... APIC ISR field:\n");
1013 print_APIC_bitfield(APIC_ISR);
1014 printk(KERN_DEBUG "... APIC TMR field:\n");
1015 print_APIC_bitfield(APIC_TMR);
1016 printk(KERN_DEBUG "... APIC IRR field:\n");
1017 print_APIC_bitfield(APIC_IRR);
1018
1019 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1020 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1021 apic_write(APIC_ESR, 0);
1022 v = apic_read(APIC_ESR);
1023 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1024 }
1025
1026 v = apic_read(APIC_ICR);
1027 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1028 v = apic_read(APIC_ICR2);
1029 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1030
1031 v = apic_read(APIC_LVTT);
1032 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1033
1034 if (maxlvt > 3) { /* PC is LVT#4. */
1035 v = apic_read(APIC_LVTPC);
1036 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1037 }
1038 v = apic_read(APIC_LVT0);
1039 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1040 v = apic_read(APIC_LVT1);
1041 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1042
1043 if (maxlvt > 2) { /* ERR is LVT#3. */
1044 v = apic_read(APIC_LVTERR);
1045 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1046 }
1047
1048 v = apic_read(APIC_TMICT);
1049 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1050 v = apic_read(APIC_TMCCT);
1051 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1052 v = apic_read(APIC_TDCR);
1053 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1054 printk("\n");
1055}
1056
1057void print_all_local_APICs (void)
1058{
1059 on_each_cpu(print_local_APIC, NULL, 1, 1);
1060}
1061
1062void __apicdebuginit print_PIC(void)
1063{
1064 extern spinlock_t i8259A_lock;
1065 unsigned int v;
1066 unsigned long flags;
1067
1068 if (apic_verbosity == APIC_QUIET)
1069 return;
1070
1071 printk(KERN_DEBUG "\nprinting PIC contents\n");
1072
1073 spin_lock_irqsave(&i8259A_lock, flags);
1074
1075 v = inb(0xa1) << 8 | inb(0x21);
1076 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1077
1078 v = inb(0xa0) << 8 | inb(0x20);
1079 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1080
1081 outb(0x0b,0xa0);
1082 outb(0x0b,0x20);
1083 v = inb(0xa0) << 8 | inb(0x20);
1084 outb(0x0a,0xa0);
1085 outb(0x0a,0x20);
1086
1087 spin_unlock_irqrestore(&i8259A_lock, flags);
1088
1089 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1090
1091 v = inb(0x4d1) << 8 | inb(0x4d0);
1092 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1093}
1094
1095#endif /* 0 */
1096
1097static void __init enable_IO_APIC(void)
1098{
1099 union IO_APIC_reg_01 reg_01;
1100 int i;
1101 unsigned long flags;
1102
1103 for (i = 0; i < PIN_MAP_SIZE; i++) {
1104 irq_2_pin[i].pin = -1;
1105 irq_2_pin[i].next = 0;
1106 }
1107 if (!pirqs_enabled)
1108 for (i = 0; i < MAX_PIRQS; i++)
1109 pirq_entries[i] = -1;
1110
1111 /*
1112 * The number of IO-APIC IRQ registers (== #pins):
1113 */
1114 for (i = 0; i < nr_ioapics; i++) {
1115 spin_lock_irqsave(&ioapic_lock, flags);
1116 reg_01.raw = io_apic_read(i, 1);
1117 spin_unlock_irqrestore(&ioapic_lock, flags);
1118 nr_ioapic_registers[i] = reg_01.bits.entries+1;
1119 }
1120
1121 /*
1122 * Do not trust the IO-APIC being empty at bootup
1123 */
1124 clear_IO_APIC();
1125}
1126
1127/*
1128 * Not an __init, needed by the reboot code
1129 */
1130void disable_IO_APIC(void)
1131{
1132 /*
1133 * Clear the IO-APIC before rebooting:
1134 */
1135 clear_IO_APIC();
1136
1137 disconnect_bsp_APIC();
1138}
1139
1140/*
1141 * function to set the IO-APIC physical IDs based on the
1142 * values stored in the MPC table.
1143 *
1144 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1145 */
1146
1147static void __init setup_ioapic_ids_from_mpc (void)
1148{
1149 union IO_APIC_reg_00 reg_00;
1150 int apic;
1151 int i;
1152 unsigned char old_id;
1153 unsigned long flags;
1154
1155 /*
1156 * Set the IOAPIC ID to the value stored in the MPC table.
1157 */
1158 for (apic = 0; apic < nr_ioapics; apic++) {
1159
1160 /* Read the register 0 value */
1161 spin_lock_irqsave(&ioapic_lock, flags);
1162 reg_00.raw = io_apic_read(apic, 0);
1163 spin_unlock_irqrestore(&ioapic_lock, flags);
1164
1165 old_id = mp_ioapics[apic].mpc_apicid;
1166
1167
1168 printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
1169
1170
1171 /*
1172 * We need to adjust the IRQ routing table
1173 * if the ID changed.
1174 */
1175 if (old_id != mp_ioapics[apic].mpc_apicid)
1176 for (i = 0; i < mp_irq_entries; i++)
1177 if (mp_irqs[i].mpc_dstapic == old_id)
1178 mp_irqs[i].mpc_dstapic
1179 = mp_ioapics[apic].mpc_apicid;
1180
1181 /*
1182 * Read the right value from the MPC table and
1183 * write it into the ID register.
1184 */
1185 apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
1186 mp_ioapics[apic].mpc_apicid);
1187
1188 reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
1189 spin_lock_irqsave(&ioapic_lock, flags);
1190 io_apic_write(apic, 0, reg_00.raw);
1191 spin_unlock_irqrestore(&ioapic_lock, flags);
1192
1193 /*
1194 * Sanity check
1195 */
1196 spin_lock_irqsave(&ioapic_lock, flags);
1197 reg_00.raw = io_apic_read(apic, 0);
1198 spin_unlock_irqrestore(&ioapic_lock, flags);
1199 if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
1200 printk("could not set ID!\n");
1201 else
1202 apic_printk(APIC_VERBOSE," ok.\n");
1203 }
1204}
1205
1206/*
1207 * There is a nasty bug in some older SMP boards: their mptable lies
1208 * about the timer IRQ. We do the following to work around the situation:
1209 *
1210 * - timer IRQ defaults to IO-APIC IRQ
1211 * - if this function detects that timer IRQs are defunct, then we fall
1212 * back to ISA timer IRQs
1213 */
1214static int __init timer_irq_works(void)
1215{
1216 unsigned long t1 = jiffies;
1217
1218 local_irq_enable();
1219 /* Let ten ticks pass... */
1220 mdelay((10 * 1000) / HZ);
1221
1222 /*
1223 * Expect a few ticks at least, to be sure some possible
1224 * glue logic does not lock up after the first one or
1225 * two ticks in a non-ExtINT mode. Also the local APIC
1226 * might have cached one ExtINT interrupt. Finally, at
1227 * least one tick may be lost due to delays.
1228 */
1229
1230 /* jiffies wrap? */
1231 if (jiffies - t1 > 4)
1232 return 1;
1233 return 0;
1234}
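/*
 * Illustrative arithmetic for the check above, assuming HZ=1000:
 * mdelay((10 * 1000) / HZ) waits 10ms, i.e. roughly ten timer ticks,
 * and "jiffies - t1 > 4" accepts the timer only if at least five of
 * them were actually delivered.
 */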
1235
1236/*
1237 * In the SMP+IOAPIC case it might happen that there are an unspecified
1238 * number of pending IRQ events unhandled. These cases are very rare,
1239 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1240 * better to do it this way, as we then do not have to be aware of
1241 * 'pending' interrupts in the IRQ path, except at this point.
1242 */
1243/*
1244 * Edge triggered needs to resend any interrupt
1245 * that was delayed but this is now handled in the device
1246 * independent code.
1247 */
1248
1249/*
1250 * Starting up an edge-triggered IO-APIC interrupt is
1251 * nasty - we need to make sure that we get the edge.
1252 * If it is already asserted for some reason, we need to
1253 * return 1 to indicate that it was pending.
1254 *
1255 * This is not complete - we should be able to fake
1256 * an edge even if it isn't on the 8259A...
1257 */
1258
1259static unsigned int startup_edge_ioapic_irq(unsigned int irq)
1260{
1261 int was_pending = 0;
1262 unsigned long flags;
1263
1264 spin_lock_irqsave(&ioapic_lock, flags);
1265 if (irq < 16) {
1266 disable_8259A_irq(irq);
1267 if (i8259A_irq_pending(irq))
1268 was_pending = 1;
1269 }
1270 __unmask_IO_APIC_irq(irq);
1271 spin_unlock_irqrestore(&ioapic_lock, flags);
1272
1273 return was_pending;
1274}
1275
1276/*
1277 * Once we have recorded IRQ_PENDING already, we can mask the
1278 * interrupt for real. This prevents IRQ storms from unhandled
1279 * devices.
1280 */
1281static void ack_edge_ioapic_irq(unsigned int irq)
1282{
1283 if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
1284 == (IRQ_PENDING | IRQ_DISABLED))
1285 mask_IO_APIC_irq(irq);
1286 ack_APIC_irq();
1287}
1288
1289/*
1290 * Level triggered interrupts can just be masked,
1291 * and shutting down and starting up the interrupt
1292 * is the same as enabling and disabling them -- except
1293 * that startup needs to return a "was pending" value.
1294 *
1295 * Level triggered interrupts are special because we
1296 * do not touch any IO-APIC register while handling
1297 * them. We ack the APIC in the end-IRQ handler, not
1298 * in the start-IRQ-handler. Protection against reentrance
1299 * from the same interrupt is still provided, both by the
1300 * generic IRQ layer and by the fact that an unacked local
1301 * APIC does not accept IRQs.
1302 */
1303static unsigned int startup_level_ioapic_irq (unsigned int irq)
1304{
1305 unmask_IO_APIC_irq(irq);
1306
1307 return 0; /* don't check for pending */
1308}
1309
1310static void end_level_ioapic_irq (unsigned int irq)
1311{
1312 ack_APIC_irq();
1313}
1314
1315static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1316{
1317 unsigned long flags;
1318 unsigned int dest;
1319
1320 dest = cpu_mask_to_apicid(mask);
1321
1322 /*
1323 * Only the high 8 bits are valid.
1324 */
1325 dest = SET_APIC_LOGICAL_ID(dest);
1326
1327 spin_lock_irqsave(&ioapic_lock, flags);
1328 __DO_ACTION(1, = dest, )
1329 spin_unlock_irqrestore(&ioapic_lock, flags);
1330}
1331
1332#ifdef CONFIG_PCI_MSI
1333static unsigned int startup_edge_ioapic_vector(unsigned int vector)
1334{
1335 int irq = vector_to_irq(vector);
1336
1337 return startup_edge_ioapic_irq(irq);
1338}
1339
1340static void ack_edge_ioapic_vector(unsigned int vector)
1341{
1342 int irq = vector_to_irq(vector);
1343
1344 ack_edge_ioapic_irq(irq);
1345}
1346
1347static unsigned int startup_level_ioapic_vector (unsigned int vector)
1348{
1349 int irq = vector_to_irq(vector);
1350
1351 return startup_level_ioapic_irq (irq);
1352}
1353
1354static void end_level_ioapic_vector (unsigned int vector)
1355{
1356 int irq = vector_to_irq(vector);
1357
1358 end_level_ioapic_irq(irq);
1359}
1360
1361static void mask_IO_APIC_vector (unsigned int vector)
1362{
1363 int irq = vector_to_irq(vector);
1364
1365 mask_IO_APIC_irq(irq);
1366}
1367
1368static void unmask_IO_APIC_vector (unsigned int vector)
1369{
1370 int irq = vector_to_irq(vector);
1371
1372 unmask_IO_APIC_irq(irq);
1373}
1374
1375static void set_ioapic_affinity_vector (unsigned int vector,
1376 cpumask_t cpu_mask)
1377{
1378 int irq = vector_to_irq(vector);
1379
1380 set_ioapic_affinity_irq(irq, cpu_mask);
1381}
1382#endif
1383
1384/*
1385 * Level and edge triggered IO-APIC interrupts need different handling,
1386 * so we use two separate IRQ descriptors. Edge triggered IRQs can be
1387 * handled with the level-triggered descriptor, but that one has slightly
1388 * more overhead. Level-triggered interrupts cannot be handled with the
1389 * edge-triggered handler, without risking IRQ storms and other ugly
1390 * races.
1391 */
1392
1393static struct hw_interrupt_type ioapic_edge_type = {
1394 .typename = "IO-APIC-edge",
1395 .startup = startup_edge_ioapic,
1396 .shutdown = shutdown_edge_ioapic,
1397 .enable = enable_edge_ioapic,
1398 .disable = disable_edge_ioapic,
1399 .ack = ack_edge_ioapic,
1400 .end = end_edge_ioapic,
1401 .set_affinity = set_ioapic_affinity,
1402};
1403
1404static struct hw_interrupt_type ioapic_level_type = {
1405 .typename = "IO-APIC-level",
1406 .startup = startup_level_ioapic,
1407 .shutdown = shutdown_level_ioapic,
1408 .enable = enable_level_ioapic,
1409 .disable = disable_level_ioapic,
1410 .ack = mask_and_ack_level_ioapic,
1411 .end = end_level_ioapic,
1412 .set_affinity = set_ioapic_affinity,
1413};
1414
1415static inline void init_IO_APIC_traps(void)
1416{
1417 int irq;
1418
1419 /*
1420 * NOTE! The local APIC isn't very good at handling
1421 * multiple interrupts at the same interrupt level.
1422 * As the interrupt level is determined by taking the
1423 * vector number and shifting that right by 4, we
1424 * want to spread these out a bit so that they don't
1425 * all fall in the same interrupt level.
1426 *
1427 * Also, we've got to be careful not to trash gate
1428 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1429 */
1430 for (irq = 0; irq < NR_IRQS ; irq++) {
1431 int tmp = irq;
1432 if (use_pci_vector()) {
1433 if (!platform_legacy_irq(tmp))
1434 if ((tmp = vector_to_irq(tmp)) == -1)
1435 continue;
1436 }
1437 if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
1438 /*
1439 * Hmm.. We don't have an entry for this,
1440 * so default to an old-fashioned 8259
1441 * interrupt if we can..
1442 */
1443 if (irq < 16)
1444 make_8259A_irq(irq);
1445 else
1446 /* Strange. Oh, well.. */
1447 irq_desc[irq].handler = &no_irq_type;
1448 }
1449 }
1450}
1451
1452static void enable_lapic_irq (unsigned int irq)
1453{
1454 unsigned long v;
1455
1456 v = apic_read(APIC_LVT0);
1457 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
1458}
1459
1460static void disable_lapic_irq (unsigned int irq)
1461{
1462 unsigned long v;
1463
1464 v = apic_read(APIC_LVT0);
1465 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
1466}
1467
1468static void ack_lapic_irq (unsigned int irq)
1469{
1470 ack_APIC_irq();
1471}
1472
1473static void end_lapic_irq (unsigned int i) { /* nothing */ }
1474
1475static struct hw_interrupt_type lapic_irq_type = {
1476 .typename = "local-APIC-edge",
1477 .startup = NULL, /* startup_irq() not used for IRQ0 */
1478 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1479 .enable = enable_lapic_irq,
1480 .disable = disable_lapic_irq,
1481 .ack = ack_lapic_irq,
1482 .end = end_lapic_irq,
1483};
1484
1485static void setup_nmi (void)
1486{
1487 /*
1488 * Dirty trick to enable the NMI watchdog ...
1489 * We put the 8259A master into AEOI mode and
1490 * unmask on all local APICs LVT0 as NMI.
1491 *
1492 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1493 * is from Maciej W. Rozycki - so we do not have to EOI from
1494 * the NMI handler or the timer interrupt.
1495 */
1496 printk(KERN_INFO "activating NMI Watchdog ...");
1497
1498 enable_NMI_through_LVT0(NULL);
1499
1500 printk(" done.\n");
1501}
1502
1503/*
1504 * This looks a bit hackish but it's about the only way of sending
1505 * a few INTA cycles to 8259As and any associated glue logic. ICR does
1506 * not support the ExtINT mode, unfortunately. We need to send these
1507 * cycles as some i82489DX-based boards have glue logic that keeps the
1508 * 8259A interrupt line asserted until INTA. --macro
1509 */
1510static inline void unlock_ExtINT_logic(void)
1511{
1512 int pin, i;
1513 struct IO_APIC_route_entry entry0, entry1;
1514 unsigned char save_control, save_freq_select;
1515 unsigned long flags;
1516
1517 pin = find_isa_irq_pin(8, mp_INT);
1518 if (pin == -1)
1519 return;
1520
1521 spin_lock_irqsave(&ioapic_lock, flags);
1522 *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
1523 *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
1524 spin_unlock_irqrestore(&ioapic_lock, flags);
1525 clear_IO_APIC_pin(0, pin);
1526
1527 memset(&entry1, 0, sizeof(entry1));
1528
1529 entry1.dest_mode = 0; /* physical delivery */
1530 entry1.mask = 0; /* unmask IRQ now */
1531 entry1.dest.physical.physical_dest = hard_smp_processor_id();
1532 entry1.delivery_mode = dest_ExtINT;
1533 entry1.polarity = entry0.polarity;
1534 entry1.trigger = 0;
1535 entry1.vector = 0;
1536
1537 spin_lock_irqsave(&ioapic_lock, flags);
1538 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
1539 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
1540 spin_unlock_irqrestore(&ioapic_lock, flags);
1541
1542 save_control = CMOS_READ(RTC_CONTROL);
1543 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
1544 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
1545 RTC_FREQ_SELECT);
1546 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
1547
1548 i = 100;
1549 while (i-- > 0) {
1550 mdelay(10);
1551 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
1552 i -= 10;
1553 }
1554
1555 CMOS_WRITE(save_control, RTC_CONTROL);
1556 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
1557 clear_IO_APIC_pin(0, pin);
1558
1559 spin_lock_irqsave(&ioapic_lock, flags);
1560 io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
1561 io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
1562 spin_unlock_irqrestore(&ioapic_lock, flags);
1563}
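/*
 * Note on the RTC programming above (illustrative numbers): rate 0x6 in
 * RTC_FREQ_SELECT selects a periodic interrupt of 32768 >> (6 - 1) = 1024 Hz
 * and RTC_PIE enables it, so IRQ 8 fires often enough for the 100 * 10ms
 * polling loop to see RTC_PF set and finish early.
 */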
1564
1565/*
1566 * This code may look a bit paranoid, but it's supposed to cooperate with
1567 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1568 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1569 * fanatically on his truly buggy board.
1570 */
1571static inline void check_timer(void)
1572{
1573 int pin1, pin2;
1574 int vector;
1575
1576 /*
1577 * get/set the timer IRQ vector:
1578 */
1579 disable_8259A_irq(0);
1580 vector = assign_irq_vector(0);
1581 set_intr_gate(vector, interrupt[0]);
1582
1583 /*
1584 * Subtle: code in do_timer_interrupt() expects an AEOI
1585 * mode for the 8259A whenever interrupts are routed
1586 * through I/O APICs. Also IRQ0 has to be enabled in
1587 * the 8259A which implies the virtual wire has to be
1588 * disabled in the local APIC.
1589 */
1590 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1591 init_8259A(1);
1592 enable_8259A_irq(0);
1593
1594 pin1 = find_isa_irq_pin(0, mp_INT);
1595 pin2 = find_isa_irq_pin(0, mp_ExtINT);
1596
1597 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
1598
1599 if (pin1 != -1) {
1600 /*
1601 * Ok, does IRQ0 through the IOAPIC work?
1602 */
1603 unmask_IO_APIC_irq(0);
1604 if (timer_irq_works()) {
1605 nmi_watchdog_default();
1606 if (nmi_watchdog == NMI_IO_APIC) {
1607 disable_8259A_irq(0);
1608 setup_nmi();
1609 enable_8259A_irq(0);
1610 check_nmi_watchdog();
1611 }
1612 return;
1613 }
1614 clear_IO_APIC_pin(0, pin1);
1615 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
1616 }
1617
1618 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
1619 if (pin2 != -1) {
1620 apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
1621 /*
1622 * legacy devices should be connected to IO APIC #0
1623 */
1624 setup_ExtINT_IRQ0_pin(pin2, vector);
1625 if (timer_irq_works()) {
1626 printk("works.\n");
1627 nmi_watchdog_default();
1628 if (nmi_watchdog == NMI_IO_APIC) {
1629 setup_nmi();
1630 check_nmi_watchdog();
1631 }
1632 return;
1633 }
1634 /*
1635 * Cleanup, just in case ...
1636 */
1637 clear_IO_APIC_pin(0, pin2);
1638 }
1639 printk(" failed.\n");
1640
1641 if (nmi_watchdog) {
1642 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
1643 nmi_watchdog = 0;
1644 }
1645
1646 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
1647
1648 disable_8259A_irq(0);
1649 irq_desc[0].handler = &lapic_irq_type;
1650 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
1651 enable_8259A_irq(0);
1652
1653 if (timer_irq_works()) {
1654 apic_printk(APIC_QUIET, " works.\n");
1655 return;
1656 }
1657 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
1658 apic_printk(APIC_VERBOSE," failed.\n");
1659
1660 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
1661
1662 init_8259A(0);
1663 make_8259A_irq(0);
1664 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
1665
1666 unlock_ExtINT_logic();
1667
1668 if (timer_irq_works()) {
1669 apic_printk(APIC_VERBOSE," works.\n");
1670 return;
1671 }
1672 apic_printk(APIC_VERBOSE," failed :(.\n");
1673 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
1674}
1675
1676/*
1677 *
1678 * IRQs that are handled by the PIC in the MPS IOAPIC case.
1679 * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
1680 * Linux doesn't really care, as it's not actually used
1681 * for any interrupt handling anyway.
1682 */
1683#define PIC_IRQS (1<<2)
1684
1685void __init setup_IO_APIC(void)
1686{
1687 enable_IO_APIC();
1688
1689 if (acpi_ioapic)
1690 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
1691 else
1692 io_apic_irqs = ~PIC_IRQS;
1693
1694 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1695
1696 /*
1697 * Set up the IO-APIC IRQ routing table.
1698 */
1699 if (!acpi_ioapic)
1700 setup_ioapic_ids_from_mpc();
1701 sync_Arb_IDs();
1702 setup_IO_APIC_irqs();
1703 init_IO_APIC_traps();
1704 check_timer();
1705 if (!acpi_ioapic)
1706 print_IO_APIC();
1707}
1708
1709struct sysfs_ioapic_data {
1710 struct sys_device dev;
1711 struct IO_APIC_route_entry entry[0];
1712};
1713static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
1714
1715static int ioapic_suspend(struct sys_device *dev, u32 state)
1716{
1717 struct IO_APIC_route_entry *entry;
1718 struct sysfs_ioapic_data *data;
1719 unsigned long flags;
1720 int i;
1721
1722 data = container_of(dev, struct sysfs_ioapic_data, dev);
1723 entry = data->entry;
1724 spin_lock_irqsave(&ioapic_lock, flags);
1725 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1726 *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
1727 *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
1728 }
1729 spin_unlock_irqrestore(&ioapic_lock, flags);
1730
1731 return 0;
1732}
1733
1734static int ioapic_resume(struct sys_device *dev)
1735{
1736 struct IO_APIC_route_entry *entry;
1737 struct sysfs_ioapic_data *data;
1738 unsigned long flags;
1739 union IO_APIC_reg_00 reg_00;
1740 int i;
1741
1742 data = container_of(dev, struct sysfs_ioapic_data, dev);
1743 entry = data->entry;
1744
1745 spin_lock_irqsave(&ioapic_lock, flags);
1746 reg_00.raw = io_apic_read(dev->id, 0);
1747 if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
1748 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
1749 io_apic_write(dev->id, 0, reg_00.raw);
1750 }
1751 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
1752 io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
1753 io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
1754 }
1755 spin_unlock_irqrestore(&ioapic_lock, flags);
1756
1757 return 0;
1758}
1759
1760static struct sysdev_class ioapic_sysdev_class = {
1761 set_kset_name("ioapic"),
1762 .suspend = ioapic_suspend,
1763 .resume = ioapic_resume,
1764};
1765
1766static int __init ioapic_init_sysfs(void)
1767{
1768 struct sys_device * dev;
1769 int i, size, error = 0;
1770
1771 error = sysdev_class_register(&ioapic_sysdev_class);
1772 if (error)
1773 return error;
1774
1775 for (i = 0; i < nr_ioapics; i++ ) {
1776 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
1777 * sizeof(struct IO_APIC_route_entry);
1778 mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
1779 if (!mp_ioapic_data[i]) {
1780 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1781 continue;
1782 }
1783 memset(mp_ioapic_data[i], 0, size);
1784 dev = &mp_ioapic_data[i]->dev;
1785 dev->id = i;
1786 dev->cls = &ioapic_sysdev_class;
1787 error = sysdev_register(dev);
1788 if (error) {
1789 kfree(mp_ioapic_data[i]);
1790 mp_ioapic_data[i] = NULL;
1791 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
1792 continue;
1793 }
1794 }
1795
1796 return 0;
1797}
1798
1799device_initcall(ioapic_init_sysfs);
1800
1801/* --------------------------------------------------------------------------
1802 ACPI-based IOAPIC Configuration
1803 -------------------------------------------------------------------------- */
1804
1805#ifdef CONFIG_ACPI_BOOT
1806
1807#define IO_APIC_MAX_ID 0xFE
1808
1809int __init io_apic_get_unique_id (int ioapic, int apic_id)
1810{
1811 union IO_APIC_reg_00 reg_00;
1812 static physid_mask_t apic_id_map;
1813 unsigned long flags;
1814 int i = 0;
1815
1816 /*
1817 * The P4 platform supports up to 256 APIC IDs on two separate APIC
1818 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
1819 * only support up to 16 on one shared APIC bus.
1820 *
1821 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
1822 * advantage of new APIC bus architecture.
1823 */
1824
1825 if (physids_empty(apic_id_map))
1826 apic_id_map = phys_cpu_present_map;
1827
1828 spin_lock_irqsave(&ioapic_lock, flags);
1829 reg_00.raw = io_apic_read(ioapic, 0);
1830 spin_unlock_irqrestore(&ioapic_lock, flags);
1831
1832 if (apic_id >= IO_APIC_MAX_ID) {
1833 apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
1834 "%d\n", ioapic, apic_id, reg_00.bits.ID);
1835 apic_id = reg_00.bits.ID;
1836 }
1837
1838 /*
1839 * Every APIC in a system must have a unique ID or we get lots of nice
1840 * 'stuck on smp_invalidate_needed IPI wait' messages.
1841 */
1842 if (physid_isset(apic_id, apic_id_map)) {
1843
1844 for (i = 0; i < IO_APIC_MAX_ID; i++) {
1845 if (!physid_isset(i, apic_id_map))
1846 break;
1847 }
1848
1849 if (i == IO_APIC_MAX_ID)
1850 panic("Max apic_id exceeded!\n");
1851
1852 apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
1853 "trying %d\n", ioapic, apic_id, i);
1854
1855 apic_id = i;
1856 }
1857
1858 physid_set(apic_id, apic_id_map);
1859
1860 if (reg_00.bits.ID != apic_id) {
1861 reg_00.bits.ID = apic_id;
1862
1863 spin_lock_irqsave(&ioapic_lock, flags);
1864 io_apic_write(ioapic, 0, reg_00.raw);
1865 reg_00.raw = io_apic_read(ioapic, 0);
1866 spin_unlock_irqrestore(&ioapic_lock, flags);
1867
1868 /* Sanity check */
1869 if (reg_00.bits.ID != apic_id)
1870			panic("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
1871 }
1872
1873 apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
1874
1875 return apic_id;
1876}
1877
1878
1879int __init io_apic_get_version (int ioapic)
1880{
1881 union IO_APIC_reg_01 reg_01;
1882 unsigned long flags;
1883
1884 spin_lock_irqsave(&ioapic_lock, flags);
1885 reg_01.raw = io_apic_read(ioapic, 1);
1886 spin_unlock_irqrestore(&ioapic_lock, flags);
1887
1888 return reg_01.bits.version;
1889}
1890
1891
1892int __init io_apic_get_redir_entries (int ioapic)
1893{
1894 union IO_APIC_reg_01 reg_01;
1895 unsigned long flags;
1896
1897 spin_lock_irqsave(&ioapic_lock, flags);
1898 reg_01.raw = io_apic_read(ioapic, 1);
1899 spin_unlock_irqrestore(&ioapic_lock, flags);
1900
1901 return reg_01.bits.entries;
1902}
1903
1904
1905int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
1906{
1907 struct IO_APIC_route_entry entry;
1908 unsigned long flags;
1909
1910 if (!IO_APIC_IRQ(irq)) {
1911 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
1912 ioapic);
1913 return -EINVAL;
1914 }
1915
1916 /*
1917 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
1918 * Note that we mask (disable) IRQs now -- these get enabled when the
1919 * corresponding device driver registers for this IRQ.
1920 */
1921
1922 memset(&entry,0,sizeof(entry));
1923
1924 entry.delivery_mode = INT_DELIVERY_MODE;
1925 entry.dest_mode = INT_DEST_MODE;
1926 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1927 entry.trigger = edge_level;
1928 entry.polarity = active_high_low;
1929 entry.mask = 1; /* Disabled (masked) */
1930
1931 /*
1932 * IRQs < 16 are already in the irq_2_pin[] map
1933 */
1934 if (irq >= 16)
1935 add_pin_to_irq(irq, ioapic, pin);
1936
1937 entry.vector = assign_irq_vector(irq);
1938
1939 apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
1940 "IRQ %d Mode:%i Active:%i)\n", ioapic,
1941 mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
1942 edge_level, active_high_low);
1943
1944 ioapic_register_intr(irq, entry.vector, edge_level);
1945
1946 if (!ioapic && (irq < 16))
1947 disable_8259A_irq(irq);
1948
1949 spin_lock_irqsave(&ioapic_lock, flags);
1950 io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
1951 io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
1952 spin_unlock_irqrestore(&ioapic_lock, flags);
1953
1954 return 0;
1955}
1956
1957#endif /*CONFIG_ACPI_BOOT*/
1958
1959
1960/*
1961 * This function is currently only a helper for the smp boot process, where
1962 * we need to reprogram the ioredtbls to cater for the cpus which have come online,
1963 * so the mask should in all cases simply be TARGET_CPUS.
1964 */
1965void __init setup_ioapic_dest(void)
1966{
1967 int pin, ioapic, irq, irq_entry;
1968
1969 if (skip_ioapic_setup == 1)
1970 return;
1971
1972 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
1973 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
1974 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
1975 if (irq_entry == -1)
1976 continue;
1977 irq = pin_2_irq(irq_entry, ioapic, pin);
1978 set_ioapic_affinity_irq(irq, TARGET_CPUS);
1979 }
1980
1981 }
1982}
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
new file mode 100644
index 000000000000..cde0e868efe3
--- /dev/null
+++ b/arch/x86_64/kernel/ioport.c
@@ -0,0 +1,117 @@
1/*
2 * linux/arch/x86_64/kernel/ioport.c
3 *
4 * This contains the io-permission bitmap code - written by obz, with changes
5 * by Linus.
6 */
7
8#include <linux/sched.h>
9#include <linux/kernel.h>
10#include <linux/errno.h>
11#include <linux/types.h>
12#include <linux/ioport.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/stddef.h>
16#include <linux/slab.h>
17#include <linux/thread_info.h>
18
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
21{
22 int i;
23 if (new_value)
24 for (i = base; i < base + extent; i++)
25 __set_bit(i, bitmap);
26 else
27 for (i = base; i < base + extent; i++)
28 clear_bit(i, bitmap);
29}
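/*
 * Illustrative note: in the TSS I/O bitmap a set bit means "access denied",
 * which is why the per-thread bitmap is initialised to all 0xff below and
 * why sys_ioperm() passes !turn_on - turning ports on clears their bits.
 */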
30
31/*
32 * this changes the io permissions bitmap in the current task.
33 */
34asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
35{
36 unsigned int i, max_long, bytes, bytes_updated;
37 struct thread_struct * t = &current->thread;
38 struct tss_struct * tss;
39 unsigned long *bitmap;
40
41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
42 return -EINVAL;
43 if (turn_on && !capable(CAP_SYS_RAWIO))
44 return -EPERM;
45
46 /*
47 * If it's the first ioperm() call in this thread's lifetime, set the
48 * IO bitmap up. ioperm() is much less timing critical than clone(),
49 * which is why we delay this operation until now:
50 */
51 if (!t->io_bitmap_ptr) {
52 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53 if (!bitmap)
54 return -ENOMEM;
55
56 memset(bitmap, 0xff, IO_BITMAP_BYTES);
57 t->io_bitmap_ptr = bitmap;
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
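/*
 * Hypothetical user-space usage sketch (requires CAP_SYS_RAWIO):
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 3, 1))	   (allow port I/O on 0x378-0x37a)
 *		perror("ioperm");
 *	outb(0xff, 0x378);
 *	ioperm(0x378, 3, 0);		   (revoke the access again)
 */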
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
116 return 0;
117}
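/*
 * Illustrative example: IOPL occupies EFLAGS bits 12-13, so iopl(3) stores
 * (eflags & ~0x3000UL) | (3 << 12), i.e. sets both bits and grants the task
 * unrestricted access to all 65536 ports without needing an I/O bitmap.
 */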
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
new file mode 100644
index 000000000000..62b112e4deb4
--- /dev/null
+++ b/arch/x86_64/kernel/irq.c
@@ -0,0 +1,108 @@
1/*
2 * linux/arch/x86_64/kernel/irq.c
3 *
4 * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
5 *
6 * This file contains the lowest level x86_64-specific interrupt
7 * entry and irq statistics code. All the remaining irq logic is
8 * done by the generic kernel/irq/ code and in the
9 * x86_64-specific irq controller code. (e.g. i8259.c and
10 * io_apic.c.)
11 */
12
13#include <linux/kernel_stat.h>
14#include <linux/interrupt.h>
15#include <linux/seq_file.h>
16#include <linux/module.h>
17#include <asm/uaccess.h>
18#include <asm/io_apic.h>
19
20atomic_t irq_err_count;
21#ifdef CONFIG_X86_IO_APIC
22#ifdef APIC_MISMATCH_DEBUG
23atomic_t irq_mis_count;
24#endif
25#endif
26
27/*
28 * Generic, controller-independent functions:
29 */
30
31int show_interrupts(struct seq_file *p, void *v)
32{
33 int i = *(loff_t *) v, j;
34 struct irqaction * action;
35 unsigned long flags;
36
37 if (i == 0) {
38 seq_printf(p, " ");
39 for (j=0; j<NR_CPUS; j++)
40 if (cpu_online(j))
41 seq_printf(p, "CPU%d ",j);
42 seq_putc(p, '\n');
43 }
44
45 if (i < NR_IRQS) {
46 spin_lock_irqsave(&irq_desc[i].lock, flags);
47 action = irq_desc[i].action;
48 if (!action)
49 goto skip;
50 seq_printf(p, "%3d: ",i);
51#ifndef CONFIG_SMP
52 seq_printf(p, "%10u ", kstat_irqs(i));
53#else
54 for (j=0; j<NR_CPUS; j++)
55 if (cpu_online(j))
56 seq_printf(p, "%10u ",
57 kstat_cpu(j).irqs[i]);
58#endif
59 seq_printf(p, " %14s", irq_desc[i].handler->typename);
60
61 seq_printf(p, " %s", action->name);
62 for (action=action->next; action; action = action->next)
63 seq_printf(p, ", %s", action->name);
64 seq_putc(p, '\n');
65skip:
66 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
67 } else if (i == NR_IRQS) {
68 seq_printf(p, "NMI: ");
69 for (j = 0; j < NR_CPUS; j++)
70 if (cpu_online(j))
71 seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
72 seq_putc(p, '\n');
73#ifdef CONFIG_X86_LOCAL_APIC
74 seq_printf(p, "LOC: ");
75 for (j = 0; j < NR_CPUS; j++)
76 if (cpu_online(j))
77 seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
78 seq_putc(p, '\n');
79#endif
80 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
81#ifdef CONFIG_X86_IO_APIC
82#ifdef APIC_MISMATCH_DEBUG
83 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
84#endif
85#endif
86 }
87 return 0;
88}
89
90/*
91 * do_IRQ handles all normal device IRQs (the special
92 * SMP cross-CPU interrupts have their own specific
93 * handlers).
94 */
95asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
96{
97 /* high bits used in ret_from_ code */
98 unsigned irq = regs->orig_rax & 0xff;
99
100 irq_enter();
101 BUG_ON(irq > 256);
102
103 __do_IRQ(irq, regs);
104 irq_exit();
105
106 return 1;
107}
108
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
new file mode 100644
index 000000000000..4f2a852299b6
--- /dev/null
+++ b/arch/x86_64/kernel/kprobes.c
@@ -0,0 +1,631 @@
1/*
2 * Kernel Probes (KProbes)
3 * arch/x86_64/kernel/kprobes.c
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004
20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation ( includes contributions from
23 * Rusty Russell).
24 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
25 * interface to access function arguments.
26 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> adapted for x86_64
28 * 2005-Mar Roland McGrath <roland@redhat.com>
29 * Fixed to handle %rip-relative addressing mode correctly.
30 */
31
32#include <linux/config.h>
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/preempt.h>
39#include <linux/moduleloader.h>
40
41#include <asm/pgtable.h>
42#include <asm/kdebug.h>
43
44static DECLARE_MUTEX(kprobe_mutex);
45
46/* kprobe_status settings */
47#define KPROBE_HIT_ACTIVE 0x00000001
48#define KPROBE_HIT_SS 0x00000002
49
50static struct kprobe *current_kprobe;
51static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
52static struct pt_regs jprobe_saved_regs;
53static long *jprobe_saved_rsp;
54static kprobe_opcode_t *get_insn_slot(void);
55static void free_insn_slot(kprobe_opcode_t *slot);
56void jprobe_return_end(void);
57
58/* copy of the kernel stack at the probe fire time */
59static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
60
61/*
62 * returns non-zero if opcode modifies the interrupt flag.
63 */
64static inline int is_IF_modifier(kprobe_opcode_t *insn)
65{
66 switch (*insn) {
67 case 0xfa: /* cli */
68 case 0xfb: /* sti */
69 case 0xcf: /* iret/iretd */
70 case 0x9d: /* popf/popfd */
71 return 1;
72 }
73
74 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
75 return 1;
76 return 0;
77}
78
79int arch_prepare_kprobe(struct kprobe *p)
80{
81 /* insn: must be on special executable page on x86_64. */
82 up(&kprobe_mutex);
83 p->ainsn.insn = get_insn_slot();
84 down(&kprobe_mutex);
85 if (!p->ainsn.insn) {
86 return -ENOMEM;
87 }
88 return 0;
89}
90
91/*
92 * Determine if the instruction uses the %rip-relative addressing mode.
93 * If it does, return the address of the 32-bit displacement word.
94 * If not, return null.
95 */
96static inline s32 *is_riprel(u8 *insn)
97{
98#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
99 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
100 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
101 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
102 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
103 << (row % 64))
104 static const u64 onebyte_has_modrm[256 / 64] = {
105 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
106 /* ------------------------------- */
107 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
108 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
109 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
110 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
111 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
112 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
113 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
114 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
115 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
116 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
117 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
118 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
119 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
120 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
121 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
122 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
123 /* ------------------------------- */
124 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
125 };
126 static const u64 twobyte_has_modrm[256 / 64] = {
127 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
128 /* ------------------------------- */
129 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
130 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
131 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
132 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
133 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
134 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
135 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
136 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
137 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
138 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
139 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
140 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
141 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
142 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
143 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
144 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
145 /* ------------------------------- */
146 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
147 };
148#undef W
149 int need_modrm;
150
151 /* Skip legacy instruction prefixes. */
152 while (1) {
153 switch (*insn) {
154 case 0x66:
155 case 0x67:
156 case 0x2e:
157 case 0x3e:
158 case 0x26:
159 case 0x64:
160 case 0x65:
161 case 0x36:
162 case 0xf0:
163 case 0xf3:
164 case 0xf2:
165 ++insn;
166 continue;
167 }
168 break;
169 }
170
171 /* Skip REX instruction prefix. */
172 if ((*insn & 0xf0) == 0x40)
173 ++insn;
174
175 if (*insn == 0x0f) { /* Two-byte opcode. */
176 ++insn;
177 need_modrm = test_bit(*insn, twobyte_has_modrm);
178 } else { /* One-byte opcode. */
179 need_modrm = test_bit(*insn, onebyte_has_modrm);
180 }
181
182 if (need_modrm) {
183 u8 modrm = *++insn;
184 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
185 /* Displacement follows ModRM byte. */
186 return (s32 *) ++insn;
187 }
188 }
189
190 /* No %rip-relative addressing mode here. */
191 return NULL;
192}
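/*
 * Illustrative decode: "lea 0x1234(%rip),%rax" is 48 8d 05 34 12 00 00;
 * the REX prefix 0x48 is skipped, opcode 0x8d takes a ModRM byte, and
 * ModRM 0x05 matches (modrm & 0xc7) == 0x05, so a pointer to the 32-bit
 * displacement word (0x1234) is returned.
 */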
193
194void arch_copy_kprobe(struct kprobe *p)
195{
196 s32 *ripdisp;
197 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
198 ripdisp = is_riprel(p->ainsn.insn);
199 if (ripdisp) {
200 /*
201 * The copied instruction uses the %rip-relative
202 * addressing mode. Adjust the displacement for the
203 * difference between the original location of this
204 * instruction and the location of the copy that will
205 * actually be run. The tricky bit here is making sure
206 * that the sign extension happens correctly in this
207 * calculation, since we need a signed 32-bit result to
208 * be sign-extended to 64 bits when it's added to the
209 * %rip value and yield the same 64-bit result that the
210 * sign-extension of the original signed 32-bit
211 * displacement would have given.
212 */
213 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
214 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
215 *ripdisp = disp;
216 }
217}
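/*
 * Worked example for the fixup above (illustrative addresses): if the probed
 * instruction sits at 0xffffffff80200000 with displacement +0x100 and its
 * copy is placed 0x100000 bytes higher, the copy gets displacement
 * 0x100 + (orig - copy) = 0x100 - 0x100000, so %rip of the copy plus the
 * new displacement still resolves to the same absolute target.
 */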
218
219void arch_remove_kprobe(struct kprobe *p)
220{
221 up(&kprobe_mutex);
222 free_insn_slot(p->ainsn.insn);
223 down(&kprobe_mutex);
224}
225
226static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
227{
228 *p->addr = p->opcode;
229 regs->rip = (unsigned long)p->addr;
230}
231
232static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
233{
234 regs->eflags |= TF_MASK;
235 regs->eflags &= ~IF_MASK;
236 /*single step inline if the instruction is an int3*/
237 if (p->opcode == BREAKPOINT_INSTRUCTION)
238 regs->rip = (unsigned long)p->addr;
239 else
240 regs->rip = (unsigned long)p->ainsn.insn;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
245 * remain disabled throughout this function.
246 */
247int kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
252
253 /* We're in an interrupt, but this is clear and BUG()-safe. */
254 preempt_disable();
255
256 /* Check we're not actually recursing */
257 if (kprobe_running()) {
258 /* We *are* holding the lock here, so this is safe.
259 Disarm the probe we just hit, and ignore it. */
260 p = get_kprobe(addr);
261 if (p) {
262 if (kprobe_status == KPROBE_HIT_SS) {
263 regs->eflags &= ~TF_MASK;
264 regs->eflags |= kprobe_saved_rflags;
265 unlock_kprobes();
266 goto no_kprobe;
267 }
268 disarm_kprobe(p, regs);
269 ret = 1;
270 } else {
271 p = current_kprobe;
272 if (p->break_handler && p->break_handler(p, regs)) {
273 goto ss_probe;
274 }
275 }
276 /* If it's not ours, it can't be a delete race (we hold the lock). */
277 goto no_kprobe;
278 }
279
280 lock_kprobes();
281 p = get_kprobe(addr);
282 if (!p) {
283 unlock_kprobes();
284 if (*addr != BREAKPOINT_INSTRUCTION) {
285 /*
286 * The breakpoint instruction was removed right
287 * after we hit it. Another cpu has removed
288 * either a probepoint or a debugger breakpoint
289 * at this address. In either case, no further
290 * handling of this interrupt is appropriate.
291 */
292 ret = 1;
293 }
294 /* Not one of ours: let kernel handle it */
295 goto no_kprobe;
296 }
297
298 kprobe_status = KPROBE_HIT_ACTIVE;
299 current_kprobe = p;
300 kprobe_saved_rflags = kprobe_old_rflags
301 = (regs->eflags & (TF_MASK | IF_MASK));
302 if (is_IF_modifier(p->ainsn.insn))
303 kprobe_saved_rflags &= ~IF_MASK;
304
305 if (p->pre_handler && p->pre_handler(p, regs))
306 /* handler has already set things up, so skip ss setup */
307 return 1;
308
309ss_probe:
310 prepare_singlestep(p, regs);
311 kprobe_status = KPROBE_HIT_SS;
312 return 1;
313
314no_kprobe:
315 preempt_enable_no_resched();
316 return ret;
317}
318
319/*
320 * Called after single-stepping. p->addr is the address of the
321 * instruction whose first byte has been replaced by the "int 3"
322 * instruction. To avoid the SMP problems that can occur when we
323 * temporarily put back the original opcode to single-step, we
324 * single-stepped a copy of the instruction. The address of this
325 * copy is p->ainsn.insn.
326 *
327 * This function prepares to return from the post-single-step
328 * interrupt. We have to fix up the stack as follows:
329 *
330 * 0) Except in the case of absolute or indirect jump or call instructions,
331 * the new rip is relative to the copied instruction. We need to make
332 * it relative to the original instruction.
333 *
334 * 1) If the single-stepped instruction was pushfl, then the TF and IF
335 * flags are set in the just-pushed eflags, and may need to be cleared.
336 *
337 * 2) If the single-stepped instruction was a call, the return address
338 * that is atop the stack is the address following the copied instruction.
339 * We need to make it the address following the original instruction.
340 */
341static void resume_execution(struct kprobe *p, struct pt_regs *regs)
342{
343 unsigned long *tos = (unsigned long *)regs->rsp;
344 unsigned long next_rip = 0;
345 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
346 unsigned long orig_rip = (unsigned long)p->addr;
347 kprobe_opcode_t *insn = p->ainsn.insn;
348
349 /* Skip the REX prefix. */
350 if (*insn >= 0x40 && *insn <= 0x4f)
351 insn++;
352
353 switch (*insn) {
354 case 0x9c: /* pushfl */
355 *tos &= ~(TF_MASK | IF_MASK);
356 *tos |= kprobe_old_rflags;
357 break;
358 case 0xe8: /* call relative - Fix return addr */
359 *tos = orig_rip + (*tos - copy_rip);
360 break;
361 case 0xff:
362 if ((*insn & 0x30) == 0x10) {
363 /* call absolute, indirect */
364 /* Fix return addr; rip is correct. */
365 next_rip = regs->rip;
366 *tos = orig_rip + (*tos - copy_rip);
367 } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */
368 ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */
369 /* rip is correct. */
370 next_rip = regs->rip;
371 }
372 break;
373 case 0xea: /* jmp absolute -- rip is correct */
374 next_rip = regs->rip;
375 break;
376 default:
377 break;
378 }
379
380 regs->eflags &= ~TF_MASK;
381 if (next_rip) {
382 regs->rip = next_rip;
383 } else {
384 regs->rip = orig_rip + (regs->rip - copy_rip);
385 }
386}
387
388/*
389 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
390 * remain disabled throughout this function. And we hold the kprobe lock.
391 */
392int post_kprobe_handler(struct pt_regs *regs)
393{
394 if (!kprobe_running())
395 return 0;
396
397 if (current_kprobe->post_handler)
398 current_kprobe->post_handler(current_kprobe, regs, 0);
399
400 resume_execution(current_kprobe, regs);
401 regs->eflags |= kprobe_saved_rflags;
402
403 unlock_kprobes();
404 preempt_enable_no_resched();
405
406 /*
407 * if somebody else is singlestepping across a probe point, eflags
408 * will have TF set, in which case, continue the remaining processing
409 * of do_debug, as if this is not a probe hit.
410 */
411 if (regs->eflags & TF_MASK)
412 return 0;
413
414 return 1;
415}
416
417/* Interrupts disabled, kprobe_lock held. */
418int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
419{
420 if (current_kprobe->fault_handler
421 && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
422 return 1;
423
424 if (kprobe_status & KPROBE_HIT_SS) {
425 resume_execution(current_kprobe, regs);
426 regs->eflags |= kprobe_old_rflags;
427
428 unlock_kprobes();
429 preempt_enable_no_resched();
430 }
431 return 0;
432}
433
434/*
435 * Wrapper routine for handling exceptions.
436 */
437int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
438 void *data)
439{
440 struct die_args *args = (struct die_args *)data;
441 switch (val) {
442 case DIE_INT3:
443 if (kprobe_handler(args->regs))
444 return NOTIFY_STOP;
445 break;
446 case DIE_DEBUG:
447 if (post_kprobe_handler(args->regs))
448 return NOTIFY_STOP;
449 break;
450 case DIE_GPF:
451 if (kprobe_running() &&
452 kprobe_fault_handler(args->regs, args->trapnr))
453 return NOTIFY_STOP;
454 break;
455 case DIE_PAGE_FAULT:
456 if (kprobe_running() &&
457 kprobe_fault_handler(args->regs, args->trapnr))
458 return NOTIFY_STOP;
459 break;
460 default:
461 break;
462 }
463 return NOTIFY_DONE;
464}
465
466int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
467{
468 struct jprobe *jp = container_of(p, struct jprobe, kp);
469 unsigned long addr;
470
471 jprobe_saved_regs = *regs;
472 jprobe_saved_rsp = (long *) regs->rsp;
473 addr = (unsigned long)jprobe_saved_rsp;
474 /*
475 * As Linus pointed out, gcc assumes that the callee
476 * owns the argument space and could overwrite it, e.g.
477 * tailcall optimization. So, to be absolutely safe
478 * we also save and restore enough stack bytes to cover
479 * the argument area.
480 */
481 memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
482 regs->eflags &= ~IF_MASK;
483 regs->rip = (unsigned long)(jp->entry);
484 return 1;
485}
486
487void jprobe_return(void)
488{
489 preempt_enable_no_resched();
490 asm volatile (" xchg %%rbx,%%rsp \n"
491 " int3 \n"
492 " .globl jprobe_return_end \n"
493 " jprobe_return_end: \n"
494 " nop \n"::"b"
495 (jprobe_saved_rsp):"memory");
496}
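/*
 * What the inline asm above does, roughly: jprobe_saved_rsp is passed in
 * %rbx (the "b" constraint), so the xchg switches the stack pointer back to
 * the value captured in setjmp_pre_handler() before executing int3.  The
 * breakpoint then lands in kprobe_handler() -> break_handler ->
 * longjmp_break_handler(), whose rsp sanity check and register/stack
 * restore rely on %rsp matching jprobe_saved_rsp at this point.
 */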
497
498int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
499{
500 u8 *addr = (u8 *) (regs->rip - 1);
501 unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
502 struct jprobe *jp = container_of(p, struct jprobe, kp);
503
504 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
505 if ((long *)regs->rsp != jprobe_saved_rsp) {
506 struct pt_regs *saved_regs =
507 container_of(jprobe_saved_rsp, struct pt_regs, rsp);
508 printk("current rsp %p does not match saved rsp %p\n",
509 (long *)regs->rsp, jprobe_saved_rsp);
510 printk("Saved registers for jprobe %p\n", jp);
511 show_registers(saved_regs);
512 printk("Current registers\n");
513 show_registers(regs);
514 BUG();
515 }
516 *regs = jprobe_saved_regs;
517 memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
518 MIN_STACK_SIZE(stack_addr));
519 return 1;
520 }
521 return 0;
522}
523
524/*
525 * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
526 * By default on x86_64, pages we get from kmalloc or vmalloc are not
527 * executable. Single-stepping an instruction on such a page yields an
528 * oops. So instead of storing the instruction copies in their respective
529 * kprobe objects, we allocate a page, map it executable, and store all the
530 * instruction copies there. (We can allocate additional pages if somebody
531 * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
532 * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
533 * bytes.
534 */
535#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
536struct kprobe_insn_page {
537 struct hlist_node hlist;
538 kprobe_opcode_t *insns; /* page of instruction slots */
539 char slot_used[INSNS_PER_PAGE];
540 int nused;
541};
542
543static struct hlist_head kprobe_insn_pages;
544
545/**
546 * get_insn_slot() - Find a slot on an executable page for an instruction.
547 * We allocate an executable page if there's no room on existing ones.
548 */
549static kprobe_opcode_t *get_insn_slot(void)
550{
551 struct kprobe_insn_page *kip;
552 struct hlist_node *pos;
553
554 hlist_for_each(pos, &kprobe_insn_pages) {
555 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
556 if (kip->nused < INSNS_PER_PAGE) {
557 int i;
558 for (i = 0; i < INSNS_PER_PAGE; i++) {
559 if (!kip->slot_used[i]) {
560 kip->slot_used[i] = 1;
561 kip->nused++;
562 return kip->insns + (i*MAX_INSN_SIZE);
563 }
564 }
565 /* Surprise! No unused slots. Fix kip->nused. */
566 kip->nused = INSNS_PER_PAGE;
567 }
568 }
569
570 /* All out of space. Need to allocate a new page. Use slot 0. */
571 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
572 if (!kip) {
573 return NULL;
574 }
575
576 /*
577 * For the %rip-relative displacement fixups to be doable, we
578 * need our instruction copy to be within +/- 2GB of any data it
579 * might access via %rip. That is, within 2GB of where the
580 * kernel image and loaded module images reside. So we allocate
581 * a page in the module loading area.
582 */
583 kip->insns = module_alloc(PAGE_SIZE);
584 if (!kip->insns) {
585 kfree(kip);
586 return NULL;
587 }
588 INIT_HLIST_NODE(&kip->hlist);
589 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
590 memset(kip->slot_used, 0, INSNS_PER_PAGE);
591 kip->slot_used[0] = 1;
592 kip->nused = 1;
593 return kip->insns;
594}
595
596/**
597 * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
598 */
599static void free_insn_slot(kprobe_opcode_t *slot)
600{
601 struct kprobe_insn_page *kip;
602 struct hlist_node *pos;
603
604 hlist_for_each(pos, &kprobe_insn_pages) {
605 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
606 if (kip->insns <= slot
607 && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
608 int i = (slot - kip->insns) / MAX_INSN_SIZE;
609 kip->slot_used[i] = 0;
610 kip->nused--;
611 if (kip->nused == 0) {
612 /*
613 * Page is no longer in use. Free it unless
614 * it's the last one. We keep the last one
615 * so as not to have to set it up again the
616 * next time somebody inserts a probe.
617 */
618 hlist_del(&kip->hlist);
619 if (hlist_empty(&kprobe_insn_pages)) {
620 INIT_HLIST_NODE(&kip->hlist);
621 hlist_add_head(&kip->hlist,
622 &kprobe_insn_pages);
623 } else {
624 module_free(NULL, kip->insns);
625 kfree(kip);
626 }
627 }
628 return;
629 }
630 }
631}
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
new file mode 100644
index 000000000000..d7e5d0cf4285
--- /dev/null
+++ b/arch/x86_64/kernel/ldt.c
@@ -0,0 +1,253 @@
1/*
2 * linux/arch/x86_64/kernel/ldt.c
3 *
4 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
5 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
6 * Copyright (C) 2002 Andi Kleen
7 *
8 * This handles calls from both 32bit and 64bit mode.
9 */
10
11#include <linux/errno.h>
12#include <linux/sched.h>
13#include <linux/string.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
16#include <linux/smp_lock.h>
17#include <linux/vmalloc.h>
18#include <linux/slab.h>
19
20#include <asm/uaccess.h>
21#include <asm/system.h>
22#include <asm/ldt.h>
23#include <asm/desc.h>
24#include <asm/proto.h>
25
26#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
27static void flush_ldt(void *null)
28{
29 if (current->active_mm)
30 load_LDT(&current->active_mm->context);
31}
32#endif
33
34static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
35{
36 void *oldldt;
37 void *newldt;
38 unsigned oldsize;
39
40 if (mincount <= (unsigned)pc->size)
41 return 0;
42 oldsize = pc->size;
43 mincount = (mincount+511)&(~511);
44 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
45 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
46 else
47 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
48
49 if (!newldt)
50 return -ENOMEM;
51
52 if (oldsize)
53 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
54 oldldt = pc->ldt;
55 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
56 wmb();
57 pc->ldt = newldt;
58 wmb();
59 pc->size = mincount;
60 wmb();
61 if (reload) {
62#ifdef CONFIG_SMP
63 cpumask_t mask;
64
65 preempt_disable();
66 mask = cpumask_of_cpu(smp_processor_id());
67 load_LDT(pc);
68 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
69 smp_call_function(flush_ldt, NULL, 1, 1);
70 preempt_enable();
71#else
72 load_LDT(pc);
73#endif
74 }
75 if (oldsize) {
76 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
77 vfree(oldldt);
78 else
79 kfree(oldldt);
80 }
81 return 0;
82}
83
84static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
85{
86 int err = alloc_ldt(new, old->size, 0);
87 if (err < 0)
88 return err;
89 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
90 return 0;
91}
92
93/*
94 * we do not have to muck with descriptors here, that is
95 * done in switch_mm() as needed.
96 */
97int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
98{
99 struct mm_struct * old_mm;
100 int retval = 0;
101
102 init_MUTEX(&mm->context.sem);
103 mm->context.size = 0;
104 old_mm = current->mm;
105 if (old_mm && old_mm->context.size > 0) {
106 down(&old_mm->context.sem);
107 retval = copy_ldt(&mm->context, &old_mm->context);
108 up(&old_mm->context.sem);
109 }
110 return retval;
111}
112
113/*
114 *
115 * Don't touch the LDT register - we're already in the next thread.
116 */
117void destroy_context(struct mm_struct *mm)
118{
119 if (mm->context.size) {
120 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
121 vfree(mm->context.ldt);
122 else
123 kfree(mm->context.ldt);
124 mm->context.size = 0;
125 }
126}
127
128static int read_ldt(void __user * ptr, unsigned long bytecount)
129{
130 int err;
131 unsigned long size;
132 struct mm_struct * mm = current->mm;
133
134 if (!mm->context.size)
135 return 0;
136 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
137 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
138
139 down(&mm->context.sem);
140 size = mm->context.size*LDT_ENTRY_SIZE;
141 if (size > bytecount)
142 size = bytecount;
143
144 err = 0;
145 if (copy_to_user(ptr, mm->context.ldt, size))
146 err = -EFAULT;
147 up(&mm->context.sem);
148 if (err < 0)
149 goto error_return;
150 if (size != bytecount) {
151 /* zero-fill the rest */
152 if (clear_user(ptr+size, bytecount-size) != 0) {
153 err = -EFAULT;
154 goto error_return;
155 }
156 }
157 return bytecount;
158error_return:
159 return err;
160}
161
162static int read_default_ldt(void __user * ptr, unsigned long bytecount)
163{
164 /* Arbitrary number */
165 /* x86-64 default LDT is all zeros */
166 if (bytecount > 128)
167 bytecount = 128;
168 if (clear_user(ptr, bytecount))
169 return -EFAULT;
170 return bytecount;
171}
172
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
174{
175 struct task_struct *me = current;
176 struct mm_struct * mm = me->mm;
177 __u32 entry_1, entry_2, *lp;
178 int error;
179 struct user_desc ldt_info;
180
181 error = -EINVAL;
182
183 if (bytecount != sizeof(ldt_info))
184 goto out;
185 error = -EFAULT;
186 if (copy_from_user(&ldt_info, ptr, bytecount))
187 goto out;
188
189 error = -EINVAL;
190 if (ldt_info.entry_number >= LDT_ENTRIES)
191 goto out;
192 if (ldt_info.contents == 3) {
193 if (oldmode)
194 goto out;
195 if (ldt_info.seg_not_present == 0)
196 goto out;
197 }
198
199 down(&mm->context.sem);
200 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
201 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
202 if (error < 0)
203 goto out_unlock;
204 }
205
206 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
207
208 /* Allow LDTs to be cleared by the user. */
209 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
210 if (oldmode || LDT_empty(&ldt_info)) {
211 entry_1 = 0;
212 entry_2 = 0;
213 goto install;
214 }
215 }
216
217 entry_1 = LDT_entry_a(&ldt_info);
218 entry_2 = LDT_entry_b(&ldt_info);
219 if (oldmode)
220 entry_2 &= ~(1 << 20);
221
222 /* Install the new entry ... */
223install:
224 *lp = entry_1;
225 *(lp+1) = entry_2;
226 error = 0;
227
228out_unlock:
229 up(&mm->context.sem);
230out:
231 return error;
232}
233
234asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
235{
236 int ret = -ENOSYS;
237
238 switch (func) {
239 case 0:
240 ret = read_ldt(ptr, bytecount);
241 break;
242 case 1:
243 ret = write_ldt(ptr, bytecount, 1);
244 break;
245 case 2:
246 ret = read_default_ldt(ptr, bytecount);
247 break;
248 case 0x11:
249 ret = write_ldt(ptr, bytecount, 0);
250 break;
251 }
252 return ret;
253}
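/*
 * Hypothetical userspace sketch (not part of this file) of how the
 * sys_modify_ldt() entry points above are reached: there is no glibc
 * wrapper, so raw syscall(2) is used.  func 0x11 is write_ldt() with
 * oldmode=0 and func 0 is read_ldt(); the names below are illustrative.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int install_and_read_ldt(void)
{
	struct user_desc ud;
	char buf[16 * 8];	/* room for 16 eight-byte descriptors */
	long n;

	memset(&ud, 0, sizeof(ud));
	ud.entry_number   = 0;		/* first LDT slot */
	ud.base_addr      = 0;
	ud.limit          = 0xfffff;
	ud.seg_32bit      = 1;
	ud.limit_in_pages = 1;
	ud.useable        = 1;

	/* func 0x11: install one entry (new mode) */
	if (syscall(SYS_modify_ldt, 0x11, &ud, sizeof(ud)) != 0)
		return -1;

	/* func 0: read the LDT back; the unused tail is zero-filled */
	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
	printf("read %ld bytes of LDT\n", n);
	return 0;
}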
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
new file mode 100644
index 000000000000..86f9fd85016a
--- /dev/null
+++ b/arch/x86_64/kernel/mce.c
@@ -0,0 +1,548 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
18#include <asm/processor.h>
19#include <asm/msr.h>
20#include <asm/mce.h>
21#include <asm/kdebug.h>
22#include <asm/uaccess.h>
23
24#define MISC_MCELOG_MINOR 227
25#define NR_BANKS 5
26
27static int mce_dont_init;
28
29/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
30 3: never panic or exit (for testing only) */
31static int tolerant = 1;
32static int banks;
33static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
34static unsigned long console_logged;
35static int notify_user;
36
37/*
38 * Lockless MCE logging infrastructure.
39 * This avoids deadlocks on printk locks without having to break locks. Also
40 * separate MCEs from kernel messages to avoid bogus bug reports.
41 */
42
43struct mce_log mcelog = {
44 MCE_LOG_SIGNATURE,
45 MCE_LOG_LEN,
46};
47
48void mce_log(struct mce *mce)
49{
50 unsigned next, entry;
51 mce->finished = 0;
52 smp_wmb();
53 for (;;) {
54 entry = rcu_dereference(mcelog.next);
55 /* When the buffer fills up discard new entries. Assume
56 that the earlier errors are the more interesting. */
57 if (entry >= MCE_LOG_LEN) {
58 set_bit(MCE_OVERFLOW, &mcelog.flags);
59 return;
60 }
61 /* Old left over entry. Skip. */
62 if (mcelog.entry[entry].finished)
63 continue;
64 smp_rmb();
65 next = entry + 1;
66 if (cmpxchg(&mcelog.next, entry, next) == entry)
67 break;
68 }
69 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
70 smp_wmb();
71 mcelog.entry[entry].finished = 1;
72 smp_wmb();
73
74 if (!test_and_set_bit(0, &console_logged))
75 notify_user = 1;
76}
77
78static void print_mce(struct mce *m)
79{
80 printk(KERN_EMERG "\n"
81 KERN_EMERG
82 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
83 m->cpu, m->mcgstatus, m->bank, m->status);
84 if (m->rip) {
85 printk(KERN_EMERG
86 "RIP%s %02x:<%016Lx> ",
87 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
88 m->cs, m->rip);
89 if (m->cs == __KERNEL_CS)
90 print_symbol("{%s}", m->rip);
91 printk("\n");
92 }
93 printk(KERN_EMERG "TSC %Lx ", m->tsc);
94 if (m->addr)
95 printk("ADDR %Lx ", m->addr);
96 if (m->misc)
97 printk("MISC %Lx ", m->misc);
98 printk("\n");
99}
100
101static void mce_panic(char *msg, struct mce *backup, unsigned long start)
102{
103 int i;
104 oops_begin();
105 for (i = 0; i < MCE_LOG_LEN; i++) {
106 unsigned long tsc = mcelog.entry[i].tsc;
107 if (time_before(tsc, start))
108 continue;
109 print_mce(&mcelog.entry[i]);
110 if (backup && mcelog.entry[i].tsc == backup->tsc)
111 backup = NULL;
112 }
113 if (backup)
114 print_mce(backup);
115 if (tolerant >= 3)
116 printk("Fake panic: %s\n", msg);
117 else
118 panic(msg);
119}
120
121static int mce_available(struct cpuinfo_x86 *c)
122{
123 return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
124 test_bit(X86_FEATURE_MCA, &c->x86_capability);
125}
126
127/*
128 * The actual machine check handler
129 */
130
131void do_machine_check(struct pt_regs * regs, long error_code)
132{
133 struct mce m, panicm;
134 int nowayout = (tolerant < 1);
135 int kill_it = 0;
136 u64 mcestart = 0;
137 int i;
138 int panicm_found = 0;
139
140 if (regs)
141 notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
142 if (!banks)
143 return;
144
145 memset(&m, 0, sizeof(struct mce));
146 m.cpu = hard_smp_processor_id();
147 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
148 if (!(m.mcgstatus & MCG_STATUS_RIPV))
149 kill_it = 1;
150
151 rdtscll(mcestart);
152 barrier();
153
154 for (i = 0; i < banks; i++) {
155 if (!bank[i])
156 continue;
157
158 m.misc = 0;
159 m.addr = 0;
160 m.bank = i;
161 m.tsc = 0;
162
163 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
164 if ((m.status & MCI_STATUS_VAL) == 0)
165 continue;
166
167 if (m.status & MCI_STATUS_EN) {
168 /* In theory _OVER could be a nowayout too, but
169 assume any overflowed errors were not fatal. */
170 nowayout |= !!(m.status & MCI_STATUS_PCC);
171 kill_it |= !!(m.status & MCI_STATUS_UC);
172 }
173
174 if (m.status & MCI_STATUS_MISCV)
175 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
176 if (m.status & MCI_STATUS_ADDRV)
177 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
178
179 if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) {
180 m.rip = regs->rip;
181 m.cs = regs->cs;
182 } else {
183 m.rip = 0;
184 m.cs = 0;
185 }
186
187 if (error_code != -1)
188 rdtscll(m.tsc);
189 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
190 mce_log(&m);
191
192 /* Did this bank cause the exception? */
193 /* Assume that the bank with uncorrectable errors did it,
194 and that there is only a single one. */
195 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
196 panicm = m;
197 panicm_found = 1;
198 }
199
200 tainted |= TAINT_MACHINE_CHECK;
201 }
202
203 /* Never do anything final in the polling timer */
204 if (!regs)
205 goto out;
206
207 /* If we didn't find an uncorrectable error, pick
208 the last one (shouldn't happen, just being safe). */
209 if (!panicm_found)
210 panicm = m;
211 if (nowayout)
212 mce_panic("Machine check", &panicm, mcestart);
213 if (kill_it) {
214 int user_space = 0;
215
216 if (m.mcgstatus & MCG_STATUS_RIPV)
217 user_space = panicm.rip && (panicm.cs & 3);
218
219 /* When the machine was in user space and the CPU didn't get
220 confused, it's normally not necessary to panic, unless you
221 are paranoid (tolerant == 0).
222
223 RED-PEN could be more tolerant for MCEs in idle,
224 but most likely they occur at boot anyways, where
225 it is best to just halt the machine. */
226 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
227 (unsigned)current->pid <= 1)
228 mce_panic("Uncorrected machine check", &panicm, mcestart);
229
230 /* do_exit takes an awful lot of locks and has a
231 slight risk of deadlocking. If you don't want that
232 don't set tolerant >= 2 */
233 if (tolerant < 3)
234 do_exit(SIGBUS);
235 }
236
237 out:
238 /* Last thing done in the machine check exception to clear state. */
239 wrmsrl(MSR_IA32_MCG_STATUS, 0);
240}
241
242/*
243 * Periodic polling timer for "silent" machine check errors.
244 */
245
246static int check_interval = 5 * 60; /* 5 minutes */
247static void mcheck_timer(void *data);
248static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
249
250static void mcheck_check_cpu(void *info)
251{
252 if (mce_available(&current_cpu_data))
253 do_machine_check(NULL, 0);
254}
255
256static void mcheck_timer(void *data)
257{
258 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
259 schedule_delayed_work(&mcheck_work, check_interval * HZ);
260
261 /*
262 * It's ok to read stale data here for notify_user and
263 * console_logged as we'll simply get the updated versions
264 * on the next mcheck_timer execution and atomic operations
265 * on console_logged act as synchronization for notify_user
266 * writes.
267 */
268 if (notify_user && console_logged) {
269 notify_user = 0;
270 clear_bit(0, &console_logged);
271 printk(KERN_INFO "Machine check events logged\n");
272 }
273}
274
275
276static __init int periodic_mcheck_init(void)
277{
278 if (check_interval)
279 schedule_delayed_work(&mcheck_work, check_interval*HZ);
280 return 0;
281}
282__initcall(periodic_mcheck_init);
283
284
285/*
286 * Initialize Machine Checks for a CPU.
287 */
288static void mce_init(void *dummy)
289{
290 u64 cap;
291 int i;
292
293 rdmsrl(MSR_IA32_MCG_CAP, cap);
294 banks = cap & 0xff;
295 if (banks > NR_BANKS) {
296 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
297 banks = NR_BANKS;
298 }
299
300 /* Log the machine checks left over from the previous reset.
301 This also clears all registers */
302 do_machine_check(NULL, -1);
303
304 set_in_cr4(X86_CR4_MCE);
305
306 if (cap & MCG_CTL_P)
307 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
308
309 for (i = 0; i < banks; i++) {
310 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
311 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
312 }
313}
314
315/* Add per CPU specific workarounds here */
316static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
317{
318 /* This should be disabled by the BIOS, but isn't always */
319 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
320 /* disable GART TBL walk error reporting, which trips off
321 incorrectly with the IOMMU & 3ware & Cerberus. */
322 clear_bit(10, &bank[4]);
323 }
324}
325
326static void __init mce_cpu_features(struct cpuinfo_x86 *c)
327{
328 switch (c->x86_vendor) {
329 case X86_VENDOR_INTEL:
330 mce_intel_feature_init(c);
331 break;
332 default:
333 break;
334 }
335}
336
337/*
338 * Called for each booted CPU to set up machine checks.
339 * Must be called with preempt off.
340 */
341void __init mcheck_init(struct cpuinfo_x86 *c)
342{
343 static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
344
345 mce_cpu_quirks(c);
346
347 if (mce_dont_init ||
348 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
349 !mce_available(c))
350 return;
351
352 mce_init(NULL);
353 mce_cpu_features(c);
354}
355
356/*
357 * Character device to read and clear the MCE log.
358 */
359
360static void collect_tscs(void *data)
361{
362 unsigned long *cpu_tsc = (unsigned long *)data;
363 rdtscll(cpu_tsc[smp_processor_id()]);
364}
365
366static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
367{
368 unsigned long cpu_tsc[NR_CPUS];
369 static DECLARE_MUTEX(mce_read_sem);
370 unsigned next;
371 char __user *buf = ubuf;
372 int i, err;
373
374 down(&mce_read_sem);
375 next = rcu_dereference(mcelog.next);
376
377 /* Only supports full reads right now */
378 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
379 up(&mce_read_sem);
380 return -EINVAL;
381 }
382
383 err = 0;
384 for (i = 0; i < next; i++) {
385 if (!mcelog.entry[i].finished)
386 continue;
387 smp_rmb();
388 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
389 buf += sizeof(struct mce);
390 }
391
392 memset(mcelog.entry, 0, next * sizeof(struct mce));
393 mcelog.next = 0;
394
395 synchronize_kernel();
396
397 /* Collect entries that were still getting written before the synchronize. */
398
399 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
400 for (i = next; i < MCE_LOG_LEN; i++) {
401 if (mcelog.entry[i].finished &&
402 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
403 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
404 smp_rmb();
405 buf += sizeof(struct mce);
406 memset(&mcelog.entry[i], 0, sizeof(struct mce));
407 }
408 }
409 up(&mce_read_sem);
410 return err ? -EFAULT : buf - ubuf;
411}
412
413static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
414{
415 int __user *p = (int __user *)arg;
416 if (!capable(CAP_SYS_ADMIN))
417 return -EPERM;
418 switch (cmd) {
419 case MCE_GET_RECORD_LEN:
420 return put_user(sizeof(struct mce), p);
421 case MCE_GET_LOG_LEN:
422 return put_user(MCE_LOG_LEN, p);
423 case MCE_GETCLEAR_FLAGS: {
424 unsigned flags;
425 do {
426 flags = mcelog.flags;
427 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
428 return put_user(flags, p);
429 }
430 default:
431 return -ENOTTY;
432 }
433}
434
435static struct file_operations mce_chrdev_ops = {
436 .read = mce_read,
437 .ioctl = mce_ioctl,
438};
439
440static struct miscdevice mce_log_device = {
441 MISC_MCELOG_MINOR,
442 "mcelog",
443 &mce_chrdev_ops,
444};
445
446/*
447 * Old style boot options parsing. Only for compatibility.
448 */
449
450static int __init mcheck_disable(char *str)
451{
452 mce_dont_init = 1;
453 return 0;
454}
455
456/* mce=off disables machine check. Note you can reenable it later
457 using sysfs */
458static int __init mcheck_enable(char *str)
459{
460 if (!strcmp(str, "off"))
461 mce_dont_init = 1;
462 else
463 printk("mce= argument %s ignored. Please use /sys", str);
464 return 0;
465}
466
467__setup("nomce", mcheck_disable);
468__setup("mce", mcheck_enable);
469
470/*
471 * Sysfs support
472 */
473
474/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
475static int mce_resume(struct sys_device *dev)
476{
477 on_each_cpu(mce_init, NULL, 1, 1);
478 return 0;
479}
480
481/* Reinit MCEs after user configuration changes */
482static void mce_restart(void)
483{
484 if (check_interval)
485 cancel_delayed_work(&mcheck_work);
486 /* Timer race is harmless here */
487 on_each_cpu(mce_init, NULL, 1, 1);
488 if (check_interval)
489 schedule_delayed_work(&mcheck_work, check_interval*HZ);
490}
491
492static struct sysdev_class mce_sysclass = {
493 .resume = mce_resume,
494 set_kset_name("machinecheck"),
495};
496
497static struct sys_device device_mce = {
498 .id = 0,
499 .cls = &mce_sysclass,
500};
501
502/* Why are there no generic functions for this? */
503#define ACCESSOR(name, var, start) \
504 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
505 return sprintf(buf, "%lx\n", (unsigned long)var); \
506 } \
507 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
508 char *end; \
509 unsigned long new = simple_strtoul(buf, &end, 0); \
510 if (end == buf) return -EINVAL; \
511 var = new; \
512 start; \
513 return end-buf; \
514 } \
515 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
516
517ACCESSOR(bank0ctl,bank[0],mce_restart())
518ACCESSOR(bank1ctl,bank[1],mce_restart())
519ACCESSOR(bank2ctl,bank[2],mce_restart())
520ACCESSOR(bank3ctl,bank[3],mce_restart())
521ACCESSOR(bank4ctl,bank[4],mce_restart())
522ACCESSOR(tolerant,tolerant,)
523ACCESSOR(check_interval,check_interval,mce_restart())
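/*
 * For reference, ACCESSOR(bank0ctl, bank[0], mce_restart()) expands to
 * roughly the following (shown only to make the attr_bank0ctl names used
 * in mce_init_device() below easier to trace):
 *
 *	static ssize_t show_bank0ctl(struct sys_device *s, char *buf)
 *	{
 *		return sprintf(buf, "%lx\n", (unsigned long)bank[0]);
 *	}
 *	static ssize_t set_bank0ctl(struct sys_device *s, const char *buf, size_t siz)
 *	{
 *		char *end;
 *		unsigned long new = simple_strtoul(buf, &end, 0);
 *		if (end == buf)
 *			return -EINVAL;
 *		bank[0] = new;
 *		mce_restart();
 *		return end - buf;
 *	}
 *	static SYSDEV_ATTR(bank0ctl, 0644, show_bank0ctl, set_bank0ctl);
 *
 * The SYSDEV_ATTR() line is what defines the attr_bank0ctl object later
 * registered with sysdev_create_file().
 */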
524
525static __init int mce_init_device(void)
526{
527 int err;
528 if (!mce_available(&boot_cpu_data))
529 return -EIO;
530 err = sysdev_class_register(&mce_sysclass);
531 if (!err)
532 err = sysdev_register(&device_mce);
533 if (!err) {
534 /* could create per CPU objects, but it is not worth it. */
535 sysdev_create_file(&device_mce, &attr_bank0ctl);
536 sysdev_create_file(&device_mce, &attr_bank1ctl);
537 sysdev_create_file(&device_mce, &attr_bank2ctl);
538 sysdev_create_file(&device_mce, &attr_bank3ctl);
539 sysdev_create_file(&device_mce, &attr_bank4ctl);
540 sysdev_create_file(&device_mce, &attr_tolerant);
541 sysdev_create_file(&device_mce, &attr_check_interval);
542 }
543
544 misc_register(&mce_log_device);
545 return err;
546
547}
548device_initcall(mce_init_device);
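/*
 * Hypothetical userspace sketch (in the spirit of the mcelog utility, not
 * part of this file): mce_read() above only accepts full-buffer reads, so
 * the caller first asks for the record and log sizes via ioctl and then
 * reads MCE_LOG_LEN records in one go.  Assumes the MCE_GET_* ioctl
 * constants from <asm/mce.h> are visible to userspace builds.
 */
#include <asm/mce.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int dump_mcelog(void)
{
	int fd, record_len, log_len;
	char *buf;
	ssize_t n;

	fd = open("/dev/mcelog", O_RDONLY);
	if (fd < 0)
		return -1;
	if (ioctl(fd, MCE_GET_RECORD_LEN, &record_len) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &log_len) < 0) {
		close(fd);
		return -1;
	}
	buf = malloc(record_len * log_len);
	if (!buf) {
		close(fd);
		return -1;
	}
	/* Partial reads are rejected with -EINVAL, so request the full log. */
	n = read(fd, buf, record_len * log_len);
	printf("read %zd bytes of machine check records\n", n);
	free(buf);
	close(fd);
	return 0;
}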
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
new file mode 100644
index 000000000000..4db9a640069f
--- /dev/null
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -0,0 +1,99 @@
1/*
2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 */
5
6#include <linux/init.h>
7#include <linux/interrupt.h>
8#include <linux/percpu.h>
9#include <asm/processor.h>
10#include <asm/msr.h>
11#include <asm/mce.h>
12#include <asm/hw_irq.h>
13
14static DEFINE_PER_CPU(unsigned long, next_check);
15
16asmlinkage void smp_thermal_interrupt(void)
17{
18 struct mce m;
19
20 ack_APIC_irq();
21
22 irq_enter();
23 if (time_before(jiffies, __get_cpu_var(next_check)))
24 goto done;
25
26 __get_cpu_var(next_check) = jiffies + HZ*300;
27 memset(&m, 0, sizeof(m));
28 m.cpu = smp_processor_id();
29 m.bank = MCE_THERMAL_BANK;
30 rdtscll(m.tsc);
31 rdmsrl(MSR_IA32_THERM_STATUS, m.status);
32 if (m.status & 0x1) {
33 printk(KERN_EMERG
34 "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
35 add_taint(TAINT_MACHINE_CHECK);
36 } else {
37 printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
38 }
39
40 mce_log(&m);
41done:
42 irq_exit();
43}
44
45static void __init intel_init_thermal(struct cpuinfo_x86 *c)
46{
47 u32 l, h;
48 int tm2 = 0;
49 unsigned int cpu = smp_processor_id();
50
51 if (!cpu_has(c, X86_FEATURE_ACPI))
52 return;
53
54 if (!cpu_has(c, X86_FEATURE_ACC))
55 return;
56
57 /* First check if TM1 is already enabled by the BIOS, in which
58 * case there might be some SMM goo which handles it, so we can't even
59 * install a handler, since the interrupt might already be delivered via SMI.
60 */
61 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
62 h = apic_read(APIC_LVTTHMR);
63 if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
64 printk(KERN_DEBUG
65 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
66 return;
67 }
68
69 if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
70 tm2 = 1;
71
72 if (h & APIC_VECTOR_MASK) {
73 printk(KERN_DEBUG
74 "CPU%d: Thermal LVT vector (%#x) already "
75 "installed\n", cpu, (h & APIC_VECTOR_MASK));
76 return;
77 }
78
79 h = THERMAL_APIC_VECTOR;
80 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
81 apic_write_around(APIC_LVTTHMR, h);
82
83 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
84 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
85
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
88
89 l = apic_read(APIC_LVTTHMR);
90 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
91 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
92 cpu, tm2 ? "TM2" : "TM1");
93 return;
94}
95
96void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
97{
98 intel_init_thermal(c);
99}
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
new file mode 100644
index 000000000000..c2ffea8845ed
--- /dev/null
+++ b/arch/x86_64/kernel/module.c
@@ -0,0 +1,166 @@
1/* Kernel module help for x86-64
2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/
19#include <linux/moduleloader.h>
20#include <linux/elf.h>
21#include <linux/vmalloc.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26
27#include <asm/system.h>
28#include <asm/page.h>
29#include <asm/pgtable.h>
30
31#define DEBUGP(fmt...)
32
33void module_free(struct module *mod, void *module_region)
34{
35 vfree(module_region);
36}
37
38void *module_alloc(unsigned long size)
39{
40 struct vm_struct *area;
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL;
51
52 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
53}
54
55/* We don't need anything special. */
56int module_frob_arch_sections(Elf_Ehdr *hdr,
57 Elf_Shdr *sechdrs,
58 char *secstrings,
59 struct module *mod)
60{
61 return 0;
62}
63
64int apply_relocate_add(Elf64_Shdr *sechdrs,
65 const char *strtab,
66 unsigned int symindex,
67 unsigned int relsec,
68 struct module *me)
69{
70 unsigned int i;
71 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
72 Elf64_Sym *sym;
73 void *loc;
74 u64 val;
75
76 DEBUGP("Applying relocate section %u to %u\n", relsec,
77 sechdrs[relsec].sh_info);
78 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
79 /* This is where to make the change */
80 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
81 + rel[i].r_offset;
82
83 /* This is the symbol it is referring to. Note that all
84 undefined symbols have been resolved. */
85 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
86 + ELF64_R_SYM(rel[i].r_info);
87
88 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
89 (int)ELF64_R_TYPE(rel[i].r_info),
90 sym->st_value, rel[i].r_addend, (u64)loc);
91
92 val = sym->st_value + rel[i].r_addend;
93
94 switch (ELF64_R_TYPE(rel[i].r_info)) {
95 case R_X86_64_NONE:
96 break;
97 case R_X86_64_64:
98 *(u64 *)loc = val;
99 break;
100 case R_X86_64_32:
101 *(u32 *)loc = val;
102 if (val != *(u32 *)loc)
103 goto overflow;
104 break;
105 case R_X86_64_32S:
106 *(s32 *)loc = val;
107 if ((s64)val != *(s32 *)loc)
108 goto overflow;
109 break;
110 case R_X86_64_PC32:
111 val -= (u64)loc;
112 *(u32 *)loc = val;
113#if 0
114 if ((s64)val != *(s32 *)loc)
115 goto overflow;
116#endif
117 break;
118 default:
119 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
120 me->name, ELF64_R_TYPE(rel[i].r_info));
121 return -ENOEXEC;
122 }
123 }
124 return 0;
125
126overflow:
127 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
128 (int)ELF64_R_TYPE(rel[i].r_info), val);
129 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
130 me->name);
131 return -ENOEXEC;
132}
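/*
 * Worked example for the R_X86_64_PC32 case above (hypothetical addresses):
 * with the relocation target S + A = 0xffffffffa0011000 and the patch
 * location P = loc = 0xffffffffa0020000, the stored value is
 * S + A - P = -0xf000, the usual PC-relative form used by call/jmp and
 * %rip-relative operands.  Truncating it to 32 bits is only safe while
 * modules load within +/- 2GB of what they reference, which is why an
 * overflow in the other cases suggests a missing -mcmodel=kernel.
 */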
133
134int apply_relocate(Elf_Shdr *sechdrs,
135 const char *strtab,
136 unsigned int symindex,
137 unsigned int relsec,
138 struct module *me)
139{
140 printk("non add relocation not supported\n");
141 return -ENOSYS;
142}
143
144extern void apply_alternatives(void *start, void *end);
145
146int module_finalize(const Elf_Ehdr *hdr,
147 const Elf_Shdr *sechdrs,
148 struct module *me)
149{
150 const Elf_Shdr *s;
151 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
152
153 /* look for .altinstructions to patch */
154 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
155 void *seg;
156 if (strcmp(".altinstructions", secstrings + s->sh_name))
157 continue;
158 seg = (void *)s->sh_addr;
159 apply_alternatives(seg, seg + s->sh_size);
160 }
161 return 0;
162}
163
164void module_arch_cleanup(struct module *mod)
165{
166}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
new file mode 100644
index 000000000000..7ec031c6ca10
--- /dev/null
+++ b/arch/x86_64/kernel/mpparse.c
@@ -0,0 +1,949 @@
1/*
2 * Intel Multiprocessor Specification 1.1 and 1.4
3 * compliant MP-table parsing routines.
4 *
5 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
6 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes
9 * Erich Boleyn : MP v1.4 and additional changes.
10 * Alan Cox : Added EBDA scanning
11 * Ingo Molnar : various cleanups and rewrites
12 * Maciej W. Rozycki: Bits for default MP configurations
13 * Paul Diefenbaugh: Added full ACPI support
14 */
15
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/init.h>
19#include <linux/delay.h>
20#include <linux/config.h>
21#include <linux/bootmem.h>
22#include <linux/smp_lock.h>
23#include <linux/kernel_stat.h>
24#include <linux/mc146818rtc.h>
25#include <linux/acpi.h>
26
27#include <asm/smp.h>
28#include <asm/mtrr.h>
29#include <asm/mpspec.h>
30#include <asm/pgalloc.h>
31#include <asm/io_apic.h>
32#include <asm/proto.h>
33
34/* Have we found an MP table */
35int smp_found_config;
36unsigned int __initdata maxcpus = NR_CPUS;
37
38int acpi_found_madt;
39
40/*
41 * Various Linux-internal data structures created from the
42 * MP-table.
43 */
44int apic_version [MAX_APICS];
45unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
46int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
47cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
48
49static int mp_current_pci_id = 0;
50/* I/O APIC entries */
51struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
52
53/* # of MP IRQ source entries */
54struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
55
56/* MP IRQ source entries */
57int mp_irq_entries;
58
59int nr_ioapics;
60int pic_mode;
61unsigned long mp_lapic_addr = 0;
62
63
64
65/* Processor that is doing the boot up */
66unsigned int boot_cpu_id = -1U;
67/* Internal processor count */
68static unsigned int num_processors = 0;
69
70/* Bitmask of physically existing CPUs */
71physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
72
73/* ACPI MADT entry parsing functions */
74#ifdef CONFIG_ACPI_BOOT
75extern struct acpi_boot_flags acpi_boot;
76#ifdef CONFIG_X86_LOCAL_APIC
77extern int acpi_parse_lapic (acpi_table_entry_header *header);
78extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
79extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
80#endif /*CONFIG_X86_LOCAL_APIC*/
81#ifdef CONFIG_X86_IO_APIC
82extern int acpi_parse_ioapic (acpi_table_entry_header *header);
83#endif /*CONFIG_X86_IO_APIC*/
84#endif /*CONFIG_ACPI_BOOT*/
85
86u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
87
88
89/*
90 * Intel MP BIOS table parsing routines:
91 */
92
93/*
94 * Checksum an MP configuration block.
95 */
96
97static int __init mpf_checksum(unsigned char *mp, int len)
98{
99 int sum = 0;
100
101 while (len--)
102 sum += *mp++;
103
104 return sum & 0xFF;
105}
106
107static void __init MP_processor_info (struct mpc_config_processor *m)
108{
109 int ver;
110
111 if (!(m->mpc_cpuflag & CPU_ENABLED))
112 return;
113
114 printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
115 m->mpc_apicid,
116 (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
117 (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
118 m->mpc_apicver);
119
120 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
121 Dprintk(" Bootup CPU\n");
122 boot_cpu_id = m->mpc_apicid;
123 }
124 if (num_processors >= NR_CPUS) {
125 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
126 " Processor ignored.\n", NR_CPUS);
127 return;
128 }
129 if (num_processors >= maxcpus) {
130 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
131 " Processor ignored.\n", maxcpus);
132 return;
133 }
134
135 num_processors++;
136
137 if (m->mpc_apicid > MAX_APICS) {
138 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
139 m->mpc_apicid, MAX_APICS);
140 return;
141 }
142 ver = m->mpc_apicver;
143
144 physid_set(m->mpc_apicid, phys_cpu_present_map);
145 /*
146 * Validate version
147 */
148 if (ver == 0x0) {
149 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
150 ver = 0x10;
151 }
152 apic_version[m->mpc_apicid] = ver;
153 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
154}
155
156static void __init MP_bus_info (struct mpc_config_bus *m)
157{
158 char str[7];
159
160 memcpy(str, m->mpc_bustype, 6);
161 str[6] = 0;
162 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
163
164 if (strncmp(str, "ISA", 3) == 0) {
165 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
166 } else if (strncmp(str, "EISA", 4) == 0) {
167 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
168 } else if (strncmp(str, "PCI", 3) == 0) {
169 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
170 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
171 mp_current_pci_id++;
172 } else if (strncmp(str, "MCA", 3) == 0) {
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
174 } else {
175 printk(KERN_ERR "Unknown bustype %s\n", str);
176 }
177}
178
179static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
180{
181 if (!(m->mpc_flags & MPC_APIC_USABLE))
182 return;
183
184 printk("I/O APIC #%d Version %d at 0x%X.\n",
185 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
186 if (nr_ioapics >= MAX_IO_APICS) {
187 printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
188 MAX_IO_APICS, nr_ioapics);
189 panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
190 }
191 if (!m->mpc_apicaddr) {
192 printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
193 " found in MP table, skipping!\n");
194 return;
195 }
196 mp_ioapics[nr_ioapics] = *m;
197 nr_ioapics++;
198}
199
200static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
201{
202 mp_irqs [mp_irq_entries] = *m;
203 Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
204 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
205 m->mpc_irqtype, m->mpc_irqflag & 3,
206 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
207 m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
208 if (++mp_irq_entries == MAX_IRQ_SOURCES)
209 panic("Max # of irq sources exceeded!!\n");
210}
211
212static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
213{
214 Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
215 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
216 m->mpc_irqtype, m->mpc_irqflag & 3,
217 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
218 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
219 /*
220 * Well it seems all SMP boards in existence
221 * use ExtINT/LVT1 == LINT0 and
222 * NMI/LVT2 == LINT1 - the following check
223 * will show us if this assumption is false.
224 * Until then we do not have to add baggage.
225 */
226 if ((m->mpc_irqtype == mp_ExtINT) &&
227 (m->mpc_destapiclint != 0))
228 BUG();
229 if ((m->mpc_irqtype == mp_NMI) &&
230 (m->mpc_destapiclint != 1))
231 BUG();
232}
233
234/*
235 * Read/parse the MPC
236 */
237
238static int __init smp_read_mpc(struct mp_config_table *mpc)
239{
240 char str[16];
241 int count=sizeof(*mpc);
242 unsigned char *mpt=((unsigned char *)mpc)+count;
243
244 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
245 printk("SMP mptable: bad signature [%c%c%c%c]!\n",
246 mpc->mpc_signature[0],
247 mpc->mpc_signature[1],
248 mpc->mpc_signature[2],
249 mpc->mpc_signature[3]);
250 return 0;
251 }
252 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
253 printk("SMP mptable: checksum error!\n");
254 return 0;
255 }
256 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
257 printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
258 mpc->mpc_spec);
259 return 0;
260 }
261 if (!mpc->mpc_lapic) {
262 printk(KERN_ERR "SMP mptable: null local APIC address!\n");
263 return 0;
264 }
265 memcpy(str,mpc->mpc_oem,8);
266 str[8]=0;
267 printk(KERN_INFO "OEM ID: %s ",str);
268
269 memcpy(str,mpc->mpc_productid,12);
270 str[12]=0;
271 printk(KERN_INFO "Product ID: %s ",str);
272
273 printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic);
274
275 /* save the local APIC address, it might be non-default */
276 if (!acpi_lapic)
277 mp_lapic_addr = mpc->mpc_lapic;
278
279 /*
280 * Now process the configuration blocks.
281 */
282 while (count < mpc->mpc_length) {
283 switch(*mpt) {
284 case MP_PROCESSOR:
285 {
286 struct mpc_config_processor *m=
287 (struct mpc_config_processor *)mpt;
288 if (!acpi_lapic)
289 MP_processor_info(m);
290 mpt += sizeof(*m);
291 count += sizeof(*m);
292 break;
293 }
294 case MP_BUS:
295 {
296 struct mpc_config_bus *m=
297 (struct mpc_config_bus *)mpt;
298 MP_bus_info(m);
299 mpt += sizeof(*m);
300 count += sizeof(*m);
301 break;
302 }
303 case MP_IOAPIC:
304 {
305 struct mpc_config_ioapic *m=
306 (struct mpc_config_ioapic *)mpt;
307 MP_ioapic_info(m);
308 mpt+=sizeof(*m);
309 count+=sizeof(*m);
310 break;
311 }
312 case MP_INTSRC:
313 {
314 struct mpc_config_intsrc *m=
315 (struct mpc_config_intsrc *)mpt;
316
317 MP_intsrc_info(m);
318 mpt+=sizeof(*m);
319 count+=sizeof(*m);
320 break;
321 }
322 case MP_LINTSRC:
323 {
324 struct mpc_config_lintsrc *m=
325 (struct mpc_config_lintsrc *)mpt;
326 MP_lintsrc_info(m);
327 mpt+=sizeof(*m);
328 count+=sizeof(*m);
329 break;
330 }
331 }
332 }
333 clustered_apic_check();
334 if (!num_processors)
335 printk(KERN_ERR "SMP mptable: no processors registered!\n");
336 return num_processors;
337}
338
339static int __init ELCR_trigger(unsigned int irq)
340{
341 unsigned int port;
342
343 port = 0x4d0 + (irq >> 3);
344 return (inb(port) >> (irq & 7)) & 1;
345}
346
347static void __init construct_default_ioirq_mptable(int mpc_default_type)
348{
349 struct mpc_config_intsrc intsrc;
350 int i;
351 int ELCR_fallback = 0;
352
353 intsrc.mpc_type = MP_INTSRC;
354 intsrc.mpc_irqflag = 0; /* conforming */
355 intsrc.mpc_srcbus = 0;
356 intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
357
358 intsrc.mpc_irqtype = mp_INT;
359
360 /*
361 * If true, we have an ISA/PCI system with no IRQ entries
362 * in the MP table. To prevent the PCI interrupts from being set up
363 * incorrectly, we try to use the ELCR. The sanity check to see if
364 * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
365 * never be level sensitive, so we simply see if the ELCR agrees.
366 * If it does, we assume it's valid.
367 */
368 if (mpc_default_type == 5) {
369 printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
370
371 if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
372 printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
373 else {
374 printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
375 ELCR_fallback = 1;
376 }
377 }
378
379 for (i = 0; i < 16; i++) {
380 switch (mpc_default_type) {
381 case 2:
382 if (i == 0 || i == 13)
383 continue; /* IRQ0 & IRQ13 not connected */
384 /* fall through */
385 default:
386 if (i == 2)
387 continue; /* IRQ2 is never connected */
388 }
389
390 if (ELCR_fallback) {
391 /*
392 * If the ELCR indicates a level-sensitive interrupt, we
393 * copy that information over to the MP table in the
394 * irqflag field (level sensitive, active high polarity).
395 */
396 if (ELCR_trigger(i))
397 intsrc.mpc_irqflag = 13;
398 else
399 intsrc.mpc_irqflag = 0;
400 }
401
402 intsrc.mpc_srcbusirq = i;
403 intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
404 MP_intsrc_info(&intsrc);
405 }
406
407 intsrc.mpc_irqtype = mp_ExtINT;
408 intsrc.mpc_srcbusirq = 0;
409 intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
410 MP_intsrc_info(&intsrc);
411}
412
413static inline void __init construct_default_ISA_mptable(int mpc_default_type)
414{
415 struct mpc_config_processor processor;
416 struct mpc_config_bus bus;
417 struct mpc_config_ioapic ioapic;
418 struct mpc_config_lintsrc lintsrc;
419 int linttypes[2] = { mp_ExtINT, mp_NMI };
420 int i;
421
422 /*
423 * local APIC has default address
424 */
425 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
426
427 /*
428 * 2 CPUs, numbered 0 & 1.
429 */
430 processor.mpc_type = MP_PROCESSOR;
431 /* Either an integrated APIC or a discrete 82489DX. */
432 processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
433 processor.mpc_cpuflag = CPU_ENABLED;
434 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
435 (boot_cpu_data.x86_model << 4) |
436 boot_cpu_data.x86_mask;
437 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
438 processor.mpc_reserved[0] = 0;
439 processor.mpc_reserved[1] = 0;
440 for (i = 0; i < 2; i++) {
441 processor.mpc_apicid = i;
442 MP_processor_info(&processor);
443 }
444
445 bus.mpc_type = MP_BUS;
446 bus.mpc_busid = 0;
447 switch (mpc_default_type) {
448 default:
449 printk(KERN_ERR "???\nUnknown standard configuration %d\n",
450 mpc_default_type);
451 /* fall through */
452 case 1:
453 case 5:
454 memcpy(bus.mpc_bustype, "ISA ", 6);
455 break;
456 case 2:
457 case 6:
458 case 3:
459 memcpy(bus.mpc_bustype, "EISA ", 6);
460 break;
461 case 4:
462 case 7:
463 memcpy(bus.mpc_bustype, "MCA ", 6);
464 }
465 MP_bus_info(&bus);
466 if (mpc_default_type > 4) {
467 bus.mpc_busid = 1;
468 memcpy(bus.mpc_bustype, "PCI ", 6);
469 MP_bus_info(&bus);
470 }
471
472 ioapic.mpc_type = MP_IOAPIC;
473 ioapic.mpc_apicid = 2;
474 ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
475 ioapic.mpc_flags = MPC_APIC_USABLE;
476 ioapic.mpc_apicaddr = 0xFEC00000;
477 MP_ioapic_info(&ioapic);
478
479 /*
480 * We set up most of the low 16 IO-APIC pins according to MPS rules.
481 */
482 construct_default_ioirq_mptable(mpc_default_type);
483
484 lintsrc.mpc_type = MP_LINTSRC;
485 lintsrc.mpc_irqflag = 0; /* conforming */
486 lintsrc.mpc_srcbusid = 0;
487 lintsrc.mpc_srcbusirq = 0;
488 lintsrc.mpc_destapic = MP_APIC_ALL;
489 for (i = 0; i < 2; i++) {
490 lintsrc.mpc_irqtype = linttypes[i];
491 lintsrc.mpc_destapiclint = i;
492 MP_lintsrc_info(&lintsrc);
493 }
494}
495
496static struct intel_mp_floating *mpf_found;
497
498/*
499 * Scan the memory blocks for an SMP configuration block.
500 */
501void __init get_smp_config (void)
502{
503 struct intel_mp_floating *mpf = mpf_found;
504
505 /*
506 * ACPI may be used to obtain the entire SMP configuration or just to
507 * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that
508 * ACPI supports both logical (e.g. Hyper-Threading) and physical
509 * processors, where MPS only supports physical.
510 */
511 if (acpi_lapic && acpi_ioapic) {
512 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
513 return;
514 }
515 else if (acpi_lapic)
516 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
517
518 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
519 if (mpf->mpf_feature2 & (1<<7)) {
520 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
521 pic_mode = 1;
522 } else {
523 printk(KERN_INFO " Virtual Wire compatibility mode.\n");
524 pic_mode = 0;
525 }
526
527 /*
528 * Now see if we need to read further.
529 */
530 if (mpf->mpf_feature1 != 0) {
531
532 printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
533 construct_default_ISA_mptable(mpf->mpf_feature1);
534
535 } else if (mpf->mpf_physptr) {
536
537 /*
538 * Read the physical hardware table. Anything here will
539 * override the defaults.
540 */
541 if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
542 smp_found_config = 0;
543 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
544 printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
545 return;
546 }
547 /*
548 * If there are no explicit MP IRQ entries, then we are
549 * broken. We set up most of the low 16 IO-APIC pins to
550 * ISA defaults and hope it will work.
551 */
552 if (!mp_irq_entries) {
553 struct mpc_config_bus bus;
554
555 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
556
557 bus.mpc_type = MP_BUS;
558 bus.mpc_busid = 0;
559 memcpy(bus.mpc_bustype, "ISA ", 6);
560 MP_bus_info(&bus);
561
562 construct_default_ioirq_mptable(0);
563 }
564
565 } else
566 BUG();
567
568 printk(KERN_INFO "Processors: %d\n", num_processors);
569 /*
570 * Only use the first configuration found.
571 */
572}
573
574static int __init smp_scan_config (unsigned long base, unsigned long length)
575{
576 extern void __bad_mpf_size(void);
577 unsigned int *bp = phys_to_virt(base);
578 struct intel_mp_floating *mpf;
579
580 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
581 if (sizeof(*mpf) != 16)
582 __bad_mpf_size();
583
584 while (length > 0) {
585 mpf = (struct intel_mp_floating *)bp;
586 if ((*bp == SMP_MAGIC_IDENT) &&
587 (mpf->mpf_length == 1) &&
588 !mpf_checksum((unsigned char *)bp, 16) &&
589 ((mpf->mpf_specification == 1)
590 || (mpf->mpf_specification == 4)) ) {
591
592 smp_found_config = 1;
593 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
594 if (mpf->mpf_physptr)
595 reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
596 mpf_found = mpf;
597 return 1;
598 }
599 bp += 4;
600 length -= 16;
601 }
602 return 0;
603}
604
605void __init find_intel_smp (void)
606{
607 unsigned int address;
608
609 /*
610 * FIXME: Linux assumes you have 640K of base ram..
611 * this continues the error...
612 *
613 * 1) Scan the bottom 1K for a signature
614 * 2) Scan the top 1K of base RAM
615 * 3) Scan the 64K of bios
616 */
617 if (smp_scan_config(0x0,0x400) ||
618 smp_scan_config(639*0x400,0x400) ||
619 smp_scan_config(0xF0000,0x10000))
620 return;
621 /*
622 * If it is an SMP machine we should know now, unless the
623 * configuration is in an EISA/MCA bus machine with an
624 * extended bios data area.
625 *
626 * there is a real-mode segmented pointer pointing to the
627 * 4K EBDA area at 0x40E, calculate and scan it here.
628 *
629 * NOTE! There are Linux loaders that will corrupt the EBDA
630 * area, and as such this kind of SMP config may be less
631 * trustworthy, simply because the SMP table may have been
632 * stomped on during early boot. These loaders are buggy and
633 * should be fixed.
634 */
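	/*
	 * For example (illustrative values only): if the 16-bit word at
	 * 0x40E reads 0x9FC0, the EBDA segment starts at physical address
	 * 0x9FC0 << 4 == 0x9FC00, which is what the shift below computes
	 * before the 4K scan.
	 */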
635
636 address = *(unsigned short *)phys_to_virt(0x40E);
637 address <<= 4;
638 if (smp_scan_config(address, 0x1000))
639 return;
640
641 /* If we have come this far, we did not find an MP table */
642 printk(KERN_INFO "No mptable found.\n");
643}
644
645/*
646 * - Intel MP Configuration Table
647 */
648void __init find_smp_config (void)
649{
650#ifdef CONFIG_X86_LOCAL_APIC
651 find_intel_smp();
652#endif
653}
654
655
656/* --------------------------------------------------------------------------
657 ACPI-based MP Configuration
658 -------------------------------------------------------------------------- */
659
660#ifdef CONFIG_ACPI_BOOT
661
662void __init mp_register_lapic_address (
663 u64 address)
664{
665 mp_lapic_addr = (unsigned long) address;
666
667 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
668
669 if (boot_cpu_id == -1U)
670 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
671
672 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
673}
674
675
676void __init mp_register_lapic (
677 u8 id,
678 u8 enabled)
679{
680 struct mpc_config_processor processor;
681 int boot_cpu = 0;
682
683 if (id >= MAX_APICS) {
684 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
685 id, MAX_APICS);
686 return;
687 }
688
689 if (id == boot_cpu_physical_apicid)
690 boot_cpu = 1;
691
692 processor.mpc_type = MP_PROCESSOR;
693 processor.mpc_apicid = id;
694 processor.mpc_apicver = 0x10; /* TBD: lapic version */
695 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
696 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
697 processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
698 (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
699 processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
700 processor.mpc_reserved[0] = 0;
701 processor.mpc_reserved[1] = 0;
702
703 MP_processor_info(&processor);
704}
705
706#ifdef CONFIG_X86_IO_APIC
707
708#define MP_ISA_BUS 0
709#define MP_MAX_IOAPIC_PIN 127
710
711static struct mp_ioapic_routing {
712 int apic_id;
713 int gsi_start;
714 int gsi_end;
715 u32 pin_programmed[4];
716} mp_ioapic_routing[MAX_IO_APICS];
717
718
719static int mp_find_ioapic (
720 int gsi)
721{
722 int i = 0;
723
724 /* Find the IOAPIC that manages this GSI. */
725 for (i = 0; i < nr_ioapics; i++) {
726 if ((gsi >= mp_ioapic_routing[i].gsi_start)
727 && (gsi <= mp_ioapic_routing[i].gsi_end))
728 return i;
729 }
730
731 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
732
733 return -1;
734}
735
736
737void __init mp_register_ioapic (
738 u8 id,
739 u32 address,
740 u32 gsi_base)
741{
742 int idx = 0;
743
744 if (nr_ioapics >= MAX_IO_APICS) {
745 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
746 "(found %d)\n", MAX_IO_APICS, nr_ioapics);
747 panic("Recompile kernel with bigger MAX_IO_APICS!\n");
748 }
749 if (!address) {
750 printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
751 " found in MADT table, skipping!\n");
752 return;
753 }
754
755 idx = nr_ioapics++;
756
757 mp_ioapics[idx].mpc_type = MP_IOAPIC;
758 mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
759 mp_ioapics[idx].mpc_apicaddr = address;
760
761 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
762 mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
763 mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
764
765 /*
766 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
767 * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
768 */
769 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
770 mp_ioapic_routing[idx].gsi_start = gsi_base;
771 mp_ioapic_routing[idx].gsi_end = gsi_base +
772 io_apic_get_redir_entries(idx);
773
774 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
775 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
776 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
777 mp_ioapic_routing[idx].gsi_start,
778 mp_ioapic_routing[idx].gsi_end);
779
780 return;
781}
782
783
784void __init mp_override_legacy_irq (
785 u8 bus_irq,
786 u8 polarity,
787 u8 trigger,
788 u32 gsi)
789{
790 struct mpc_config_intsrc intsrc;
791 int ioapic = -1;
792 int pin = -1;
793
794 /*
795 * Convert 'gsi' to 'ioapic.pin'.
796 */
797 ioapic = mp_find_ioapic(gsi);
798 if (ioapic < 0)
799 return;
800 pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
801
802 /*
803 * TBD: This check is for faulty timer entries, where the override
804 * erroneously sets the trigger to level, resulting in a HUGE
805 * increase of timer interrupts!
806 */
807 if ((bus_irq == 0) && (trigger == 3))
808 trigger = 1;
809
810 intsrc.mpc_type = MP_INTSRC;
811 intsrc.mpc_irqtype = mp_INT;
812 intsrc.mpc_irqflag = (trigger << 2) | polarity;
813 intsrc.mpc_srcbus = MP_ISA_BUS;
814 intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
815 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
816 intsrc.mpc_dstirq = pin; /* INTIN# */
817
818 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
819 intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
820 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
821 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
822
823 mp_irqs[mp_irq_entries] = intsrc;
824 if (++mp_irq_entries == MAX_IRQ_SOURCES)
825 panic("Max # of irq sources exceeded!\n");
826
827 return;
828}
829
830
831void __init mp_config_acpi_legacy_irqs (void)
832{
833 struct mpc_config_intsrc intsrc;
834 int i = 0;
835 int ioapic = -1;
836
837 /*
 838	 * Fabricate the legacy ISA bus (bus #0, MP_ISA_BUS).
839 */
840 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
841 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
842
843 /*
844 * Locate the IOAPIC that manages the ISA IRQs (0-15).
845 */
846 ioapic = mp_find_ioapic(0);
847 if (ioapic < 0)
848 return;
849
850 intsrc.mpc_type = MP_INTSRC;
851 intsrc.mpc_irqflag = 0; /* Conforming */
852 intsrc.mpc_srcbus = MP_ISA_BUS;
853 intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
854
855 /*
 856	 * Use the default configuration for IRQs 0-15, unless overridden
 857	 * by (MADT) interrupt source override entries.
858 */
859 for (i = 0; i < 16; i++) {
860 int idx;
861
862 for (idx = 0; idx < mp_irq_entries; idx++) {
863 struct mpc_config_intsrc *irq = mp_irqs + idx;
864
865 /* Do we already have a mapping for this ISA IRQ? */
866 if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
867 break;
868
 869			/* Do we already have a mapping for this IOAPIC pin? */
870 if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
871 (irq->mpc_dstirq == i))
872 break;
873 }
874
875 if (idx != mp_irq_entries) {
876 printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
877 continue; /* IRQ already used */
878 }
879
880 intsrc.mpc_irqtype = mp_INT;
881 intsrc.mpc_srcbusirq = i; /* Identity mapped */
882 intsrc.mpc_dstirq = i;
883
884 Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
885 "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
886 (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
887 intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
888 intsrc.mpc_dstirq);
889
890 mp_irqs[mp_irq_entries] = intsrc;
891 if (++mp_irq_entries == MAX_IRQ_SOURCES)
892 panic("Max # of irq sources exceeded!\n");
893 }
894
895 return;
896}
897
898int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
899{
900 int ioapic = -1;
901 int ioapic_pin = 0;
902 int idx, bit = 0;
903
904 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
905 return gsi;
906
907#ifdef CONFIG_ACPI_BUS
908 /* Don't set up the ACPI SCI because it's already set up */
909 if (acpi_fadt.sci_int == gsi)
910 return gsi;
911#endif
912
913 ioapic = mp_find_ioapic(gsi);
914 if (ioapic < 0) {
915 printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
916 return gsi;
917 }
918
919 ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
920
921 /*
922 * Avoid pin reprogramming. PRTs typically include entries
923 * with redundant pin->gsi mappings (but unique PCI devices);
924 * we only program the IOAPIC on the first.
925 */
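	/*
	 * For example (illustrative numbers): GSI 40 on an IOAPIC whose
	 * gsi_start is 24 is pin 16, which the computation below places
	 * in pin_programmed[0], bit 16.
	 */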
926 bit = ioapic_pin % 32;
927 idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
928 if (idx > 3) {
929 printk(KERN_ERR "Invalid reference to IOAPIC pin "
930 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
931 ioapic_pin);
932 return gsi;
933 }
934 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
935 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
936 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
937 return gsi;
938 }
939
940 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
941
942 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
943 edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
944 active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
945 return gsi;
946}
947
948#endif /*CONFIG_X86_IO_APIC*/
949#endif /*CONFIG_ACPI_BOOT*/
diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c
new file mode 100644
index 000000000000..598953ab0154
--- /dev/null
+++ b/arch/x86_64/kernel/msr.c
@@ -0,0 +1,279 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
8 * USA; either version 2 of the License, or (at your option) any later
9 * version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * msr.c
15 *
16 * x86 MSR access device
17 *
18 * This device is accessed by lseek() to the appropriate register number
19 * and then read/write in chunks of 8 bytes. A larger size means multiple
20 * reads or writes of the same register.
21 *
22 * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
23 * an SMP box will direct the access to CPU %d.
24 */
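/*
 * Illustrative userspace sketch (assumption: not part of this driver,
 * shown only to demonstrate the lseek/read protocol described above;
 * 0x10, the IA32 time stamp counter MSR, is picked purely as an example):
 */
#if 0
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	/* seek to the register number, then read 8 bytes of MSR contents */
	if (lseek(fd, 0x10, SEEK_SET) < 0 || read(fd, &val, 8) != 8) {
		close(fd);
		return 1;
	}
	printf("TSC on CPU 0: %llu\n", (unsigned long long)val);
	close(fd);
	return 0;
}
#endif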
25
26#include <linux/module.h>
27#include <linux/config.h>
28
29#include <linux/types.h>
30#include <linux/errno.h>
31#include <linux/fcntl.h>
32#include <linux/init.h>
33#include <linux/poll.h>
34#include <linux/smp.h>
35#include <linux/smp_lock.h>
36#include <linux/major.h>
37#include <linux/fs.h>
38
39#include <asm/processor.h>
40#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h>
43
44/* Note: "err" is handled in a funny way below. Otherwise one version
45 of gcc or another breaks. */
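/*
 * Both helpers rely on the exception table: the ".quad 1b,3b" entry
 * tells the #GP fixup code that a fault on the rdmsr/wrmsr at label 1
 * should resume at label 3, which loads -EIO into "err" and jumps back
 * to label 2.  Poking a non-existent MSR therefore returns -EIO to the
 * caller instead of oopsing.
 */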
46
47static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
48{
49 int err;
50
51 asm volatile ("1: wrmsr\n"
52 "2:\n"
53 ".section .fixup,\"ax\"\n"
54 "3: movl %4,%0\n"
55 " jmp 2b\n"
56 ".previous\n"
57 ".section __ex_table,\"a\"\n"
58 " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err)
59 :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
60
61 return err;
62}
63
64static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
65{
66 int err;
67
68 asm volatile ("1: rdmsr\n"
69 "2:\n"
70 ".section .fixup,\"ax\"\n"
71 "3: movl %4,%0\n"
72 " jmp 2b\n"
73 ".previous\n"
74 ".section __ex_table,\"a\"\n"
75 " .align 8\n"
76 " .quad 1b,3b\n"
77 ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
78 :"c"(reg), "i"(-EIO), "0"(0));
79
80 return err;
81}
82
83#ifdef CONFIG_SMP
84
85struct msr_command {
86 int cpu;
87 int err;
88 u32 reg;
89 u32 data[2];
90};
91
92static void msr_smp_wrmsr(void *cmd_block)
93{
94 struct msr_command *cmd = (struct msr_command *)cmd_block;
95
96 if (cmd->cpu == smp_processor_id())
97 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
98}
99
100static void msr_smp_rdmsr(void *cmd_block)
101{
102 struct msr_command *cmd = (struct msr_command *)cmd_block;
103
104 if (cmd->cpu == smp_processor_id())
105 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
106}
107
108static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
109{
110 struct msr_command cmd;
111 int ret;
112
113 preempt_disable();
114 if (cpu == smp_processor_id()) {
115 ret = wrmsr_eio(reg, eax, edx);
116 } else {
117 cmd.cpu = cpu;
118 cmd.reg = reg;
119 cmd.data[0] = eax;
120 cmd.data[1] = edx;
121
122 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
123 ret = cmd.err;
124 }
125 preempt_enable();
126 return ret;
127}
128
129static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
130{
131 struct msr_command cmd;
132 int ret;
133
134 preempt_disable();
135 if (cpu == smp_processor_id()) {
136 ret = rdmsr_eio(reg, eax, edx);
137 } else {
138 cmd.cpu = cpu;
139 cmd.reg = reg;
140
141 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
142
143 *eax = cmd.data[0];
144 *edx = cmd.data[1];
145
146 ret = cmd.err;
147 }
148 preempt_enable();
149 return ret;
150}
151
152#else /* ! CONFIG_SMP */
153
154static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
155{
156 return wrmsr_eio(reg, eax, edx);
157}
158
159static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
160{
161 return rdmsr_eio(reg, eax, edx);
162}
163
164#endif /* ! CONFIG_SMP */
165
166static loff_t msr_seek(struct file *file, loff_t offset, int orig)
167{
168 loff_t ret = -EINVAL;
169
170 lock_kernel();
171 switch (orig) {
172 case 0:
173 file->f_pos = offset;
174 ret = file->f_pos;
175 break;
176 case 1:
177 file->f_pos += offset;
178 ret = file->f_pos;
179 }
180 unlock_kernel();
181 return ret;
182}
183
184static ssize_t msr_read(struct file *file, char __user * buf,
185 size_t count, loff_t * ppos)
186{
187 u32 __user *tmp = (u32 __user *) buf;
188 u32 data[2];
189 size_t rv;
190 u32 reg = *ppos;
191 int cpu = iminor(file->f_dentry->d_inode);
192 int err;
193
194 if (count % 8)
195 return -EINVAL; /* Invalid chunk size */
196
197 for (rv = 0; count; count -= 8) {
198 err = do_rdmsr(cpu, reg, &data[0], &data[1]);
199 if (err)
200 return err;
201 if (copy_to_user(tmp, &data, 8))
202 return -EFAULT;
203 tmp += 2;
204 }
205
206 return ((char __user *)tmp) - buf;
207}
208
209static ssize_t msr_write(struct file *file, const char __user *buf,
210 size_t count, loff_t *ppos)
211{
212 const u32 __user *tmp = (const u32 __user *)buf;
213 u32 data[2];
214 size_t rv;
215 u32 reg = *ppos;
216 int cpu = iminor(file->f_dentry->d_inode);
217 int err;
218
219 if (count % 8)
220 return -EINVAL; /* Invalid chunk size */
221
222 for (rv = 0; count; count -= 8) {
223 if (copy_from_user(&data, tmp, 8))
224 return -EFAULT;
225 err = do_wrmsr(cpu, reg, data[0], data[1]);
226 if (err)
227 return err;
228 tmp += 2;
229 }
230
231 return ((char __user *)tmp) - buf;
232}
233
234static int msr_open(struct inode *inode, struct file *file)
235{
236 unsigned int cpu = iminor(file->f_dentry->d_inode);
237 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
238
239 if (cpu >= NR_CPUS || !cpu_online(cpu))
240 return -ENXIO; /* No such CPU */
241 if (!cpu_has(c, X86_FEATURE_MSR))
242 return -EIO; /* MSR not supported */
243
244 return 0;
245}
246
247/*
248 * File operations we support
249 */
250static struct file_operations msr_fops = {
251 .owner = THIS_MODULE,
252 .llseek = msr_seek,
253 .read = msr_read,
254 .write = msr_write,
255 .open = msr_open,
256};
257
258static int __init msr_init(void)
259{
260 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
261 printk(KERN_ERR "msr: unable to get major %d for msr\n",
262 MSR_MAJOR);
263 return -EBUSY;
264 }
265
266 return 0;
267}
268
269static void __exit msr_exit(void)
270{
271 unregister_chrdev(MSR_MAJOR, "cpu/msr");
272}
273
274module_init(msr_init);
275module_exit(msr_exit);
276
277MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
278MODULE_DESCRIPTION("x86 generic MSR driver");
279MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
new file mode 100644
index 000000000000..d9867de6a626
--- /dev/null
+++ b/arch/x86_64/kernel/nmi.c
@@ -0,0 +1,488 @@
1/*
  2 *  linux/arch/x86_64/kernel/nmi.c
3 *
4 * NMI watchdog support on APIC systems
5 *
6 * Started by Ingo Molnar <mingo@redhat.com>
7 *
8 * Fixes:
9 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
10 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
11 * Pavel Machek and
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/irq.h>
18#include <linux/delay.h>
19#include <linux/bootmem.h>
20#include <linux/smp_lock.h>
21#include <linux/interrupt.h>
22#include <linux/mc146818rtc.h>
23#include <linux/kernel_stat.h>
24#include <linux/module.h>
25#include <linux/sysdev.h>
26#include <linux/nmi.h>
27#include <linux/sysctl.h>
28
29#include <asm/smp.h>
30#include <asm/mtrr.h>
31#include <asm/mpspec.h>
32#include <asm/nmi.h>
33#include <asm/msr.h>
34#include <asm/proto.h>
35#include <asm/kdebug.h>
36
37/*
38 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
39 * - it may be reserved by some other driver, or not
40 * - when not reserved by some other driver, it may be used for
41 * the NMI watchdog, or not
42 *
43 * This is maintained separately from nmi_active because the NMI
44 * watchdog may also be driven from the I/O APIC timer.
45 */
46static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
47static unsigned int lapic_nmi_owner;
48#define LAPIC_NMI_WATCHDOG (1<<0)
49#define LAPIC_NMI_RESERVED (1<<1)
50
51/* nmi_active:
52 * +1: the lapic NMI watchdog is active, but can be disabled
53 * 0: the lapic NMI watchdog has not been set up, and cannot
54 * be enabled
55 * -1: the lapic NMI watchdog is disabled, but can be enabled
56 */
57int nmi_active; /* oprofile uses this */
58int panic_on_timeout;
59
60unsigned int nmi_watchdog = NMI_DEFAULT;
61static unsigned int nmi_hz = HZ;
62unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
63
64/* Note that these events don't tick when the CPU idles. This means
65 the frequency varies with CPU load. */
66
67#define K7_EVNTSEL_ENABLE (1 << 22)
68#define K7_EVNTSEL_INT (1 << 20)
69#define K7_EVNTSEL_OS (1 << 17)
70#define K7_EVNTSEL_USR (1 << 16)
71#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
72#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
73
74#define P6_EVNTSEL0_ENABLE (1 << 22)
75#define P6_EVNTSEL_INT (1 << 20)
76#define P6_EVNTSEL_OS (1 << 17)
77#define P6_EVNTSEL_USR (1 << 16)
78#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
79#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
80
81/* Run after command line and cpu_init init, but before all other checks */
82void __init nmi_watchdog_default(void)
83{
84 if (nmi_watchdog != NMI_DEFAULT)
85 return;
86
87 /* For some reason the IO APIC watchdog doesn't work on the AMD
88 8111 chipset. For now switch to local APIC mode using
89 perfctr0 there. On Intel CPUs we don't have code to handle
90 the perfctr and the IO-APIC seems to work, so use that. */
91
92 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
93 nmi_watchdog = NMI_LOCAL_APIC;
94 printk(KERN_INFO
95 "Using local APIC NMI watchdog using perfctr0\n");
96 } else {
97 printk(KERN_INFO "Using IO APIC NMI watchdog\n");
98 nmi_watchdog = NMI_IO_APIC;
99 }
100}
101
102/* Why is there no CPUID flag for this? */
103static __init int cpu_has_lapic(void)
104{
105 switch (boot_cpu_data.x86_vendor) {
106 case X86_VENDOR_INTEL:
107 case X86_VENDOR_AMD:
108 return boot_cpu_data.x86 >= 6;
109 /* .... add more cpus here or find a different way to figure this out. */
110 default:
111 return 0;
112 }
113}
114
115int __init check_nmi_watchdog (void)
116{
117 int counts[NR_CPUS];
118 int cpu;
119
120 if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) {
121 nmi_watchdog = NMI_NONE;
122 return -1;
123 }
124
125 printk(KERN_INFO "testing NMI watchdog ... ");
126
127 for (cpu = 0; cpu < NR_CPUS; cpu++)
128 counts[cpu] = cpu_pda[cpu].__nmi_count;
129 local_irq_enable();
130 mdelay((10*1000)/nmi_hz); // wait 10 ticks
131
132 for (cpu = 0; cpu < NR_CPUS; cpu++) {
133#ifdef CONFIG_SMP
134 /* Check cpu_callin_map here because that is set
135 after the timer is started. */
136 if (!cpu_isset(cpu, cpu_callin_map))
137 continue;
138#endif
139 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
140 printk("CPU#%d: NMI appears to be stuck (%d)!\n",
141 cpu,
142 cpu_pda[cpu].__nmi_count);
143 nmi_active = 0;
144 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
145 return -1;
146 }
147 }
148 printk("OK.\n");
149
150 /* now that we know it works we can reduce NMI frequency to
151 something more reasonable; makes a difference in some configs */
152 if (nmi_watchdog == NMI_LOCAL_APIC)
153 nmi_hz = 1;
154
155 return 0;
156}
157
158int __init setup_nmi_watchdog(char *str)
159{
160 int nmi;
161
162 if (!strncmp(str,"panic",5)) {
163 panic_on_timeout = 1;
164 str = strchr(str, ',');
165 if (!str)
166 return 1;
167 ++str;
168 }
169
170 get_option(&str, &nmi);
171
172 if (nmi >= NMI_INVALID)
173 return 0;
174 nmi_watchdog = nmi;
175 return 1;
176}
177
178__setup("nmi_watchdog=", setup_nmi_watchdog);
179
180static void disable_lapic_nmi_watchdog(void)
181{
182 if (nmi_active <= 0)
183 return;
184 switch (boot_cpu_data.x86_vendor) {
185 case X86_VENDOR_AMD:
186 wrmsr(MSR_K7_EVNTSEL0, 0, 0);
187 break;
188 case X86_VENDOR_INTEL:
189 wrmsr(MSR_IA32_EVNTSEL0, 0, 0);
190 break;
191 }
192 nmi_active = -1;
193 /* tell do_nmi() and others that we're not active any more */
194 nmi_watchdog = 0;
195}
196
197static void enable_lapic_nmi_watchdog(void)
198{
199 if (nmi_active < 0) {
200 nmi_watchdog = NMI_LOCAL_APIC;
201 setup_apic_nmi_watchdog();
202 }
203}
204
205int reserve_lapic_nmi(void)
206{
207 unsigned int old_owner;
208
209 spin_lock(&lapic_nmi_owner_lock);
210 old_owner = lapic_nmi_owner;
211 lapic_nmi_owner |= LAPIC_NMI_RESERVED;
212 spin_unlock(&lapic_nmi_owner_lock);
213 if (old_owner & LAPIC_NMI_RESERVED)
214 return -EBUSY;
215 if (old_owner & LAPIC_NMI_WATCHDOG)
216 disable_lapic_nmi_watchdog();
217 return 0;
218}
219
220void release_lapic_nmi(void)
221{
222 unsigned int new_owner;
223
224 spin_lock(&lapic_nmi_owner_lock);
225 new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
226 lapic_nmi_owner = new_owner;
227 spin_unlock(&lapic_nmi_owner_lock);
228 if (new_owner & LAPIC_NMI_WATCHDOG)
229 enable_lapic_nmi_watchdog();
230}
231
232void disable_timer_nmi_watchdog(void)
233{
234 if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
235 return;
236
237 disable_irq(0);
238 unset_nmi_callback();
239 nmi_active = -1;
240 nmi_watchdog = NMI_NONE;
241}
242
243void enable_timer_nmi_watchdog(void)
244{
245 if (nmi_active < 0) {
246 nmi_watchdog = NMI_IO_APIC;
247 touch_nmi_watchdog();
248 nmi_active = 1;
249 enable_irq(0);
250 }
251}
252
253#ifdef CONFIG_PM
254
255static int nmi_pm_active; /* nmi_active before suspend */
256
257static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
258{
259 nmi_pm_active = nmi_active;
260 disable_lapic_nmi_watchdog();
261 return 0;
262}
263
264static int lapic_nmi_resume(struct sys_device *dev)
265{
266 if (nmi_pm_active > 0)
267 enable_lapic_nmi_watchdog();
268 return 0;
269}
270
271static struct sysdev_class nmi_sysclass = {
272 set_kset_name("lapic_nmi"),
273 .resume = lapic_nmi_resume,
274 .suspend = lapic_nmi_suspend,
275};
276
277static struct sys_device device_lapic_nmi = {
278 .id = 0,
279 .cls = &nmi_sysclass,
280};
281
282static int __init init_lapic_nmi_sysfs(void)
283{
284 int error;
285
286 if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
287 return 0;
288
289 error = sysdev_class_register(&nmi_sysclass);
290 if (!error)
291 error = sysdev_register(&device_lapic_nmi);
292 return error;
293}
294/* must come after the local APIC's device_initcall() */
295late_initcall(init_lapic_nmi_sysfs);
296
297#endif /* CONFIG_PM */
298
299/*
300 * Activate the NMI watchdog via the local APIC.
301 * Original code written by Keith Owens.
302 */
303
304static void setup_k7_watchdog(void)
305{
306 int i;
307 unsigned int evntsel;
308
309 /* No check, so can start with slow frequency */
310 nmi_hz = 1;
311
312 /* XXX should check these in EFER */
313
314 nmi_perfctr_msr = MSR_K7_PERFCTR0;
315
316 for(i = 0; i < 4; ++i) {
317 /* Simulator may not support it */
318 if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL))
319 return;
320 wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
321 }
322
323 evntsel = K7_EVNTSEL_INT
324 | K7_EVNTSEL_OS
325 | K7_EVNTSEL_USR
326 | K7_NMI_EVENT;
327
328 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
329 wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz);
330 apic_write(APIC_LVTPC, APIC_DM_NMI);
331 evntsel |= K7_EVNTSEL_ENABLE;
332 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
333}
334
335void setup_apic_nmi_watchdog(void)
336{
337 switch (boot_cpu_data.x86_vendor) {
338 case X86_VENDOR_AMD:
339 if (boot_cpu_data.x86 < 6)
340 return;
341 if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
342 return;
343 setup_k7_watchdog();
344 break;
345 default:
346 return;
347 }
348 lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
349 nmi_active = 1;
350}
351
352/*
353 * the best way to detect whether a CPU has a 'hard lockup' problem
 354 * is to check its local APIC timer IRQ counts. If they are not
355 * changing then that CPU has some problem.
356 *
357 * as these watchdog NMI IRQs are generated on every CPU, we only
358 * have to check the current processor.
359 *
360 * since NMIs don't listen to _any_ locks, we have to be extremely
361 * careful not to rely on unsafe variables. The printk might lock
362 * up though, so we have to break up any console locks first ...
 363 * [if more tty-related locks are added later, break them up
 364 * here too!]
365 */
366
367static unsigned int
368 last_irq_sums [NR_CPUS],
369 alert_counter [NR_CPUS];
370
371void touch_nmi_watchdog (void)
372{
373 int i;
374
375 /*
376 * Just reset the alert counters, (other CPUs might be
377 * spinning on locks we hold):
378 */
379 for (i = 0; i < NR_CPUS; i++)
380 alert_counter[i] = 0;
381}
382
383void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
384{
385 int sum, cpu;
386
387 cpu = safe_smp_processor_id();
388 sum = read_pda(apic_timer_irqs);
389 if (last_irq_sums[cpu] == sum) {
390 /*
391 * Ayiee, looks like this CPU is stuck ...
392 * wait a few IRQs (5 seconds) before doing the oops ...
393 */
394 alert_counter[cpu]++;
395 if (alert_counter[cpu] == 5*nmi_hz) {
396 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
397 == NOTIFY_STOP) {
398 alert_counter[cpu] = 0;
399 return;
400 }
401 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
402 }
403 } else {
404 last_irq_sums[cpu] = sum;
405 alert_counter[cpu] = 0;
406 }
407 if (nmi_perfctr_msr)
408 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
409}
410
411static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
412{
413 return 0;
414}
415
416static nmi_callback_t nmi_callback = dummy_nmi_callback;
417
418asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
419{
420 int cpu = safe_smp_processor_id();
421
422 nmi_enter();
423 add_pda(__nmi_count,1);
424 if (!nmi_callback(regs, cpu))
425 default_do_nmi(regs);
426 nmi_exit();
427}
428
429void set_nmi_callback(nmi_callback_t callback)
430{
431 nmi_callback = callback;
432}
433
434void unset_nmi_callback(void)
435{
436 nmi_callback = dummy_nmi_callback;
437}
438
439#ifdef CONFIG_SYSCTL
440
441static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
442{
443 unsigned char reason = get_nmi_reason();
444 char buf[64];
445
446 if (!(reason & 0xc0)) {
447 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
448 die_nmi(buf,regs);
449 }
450 return 0;
451}
452
453/*
454 * proc handler for /proc/sys/kernel/unknown_nmi_panic
455 */
456int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
457 void __user *buffer, size_t *length, loff_t *ppos)
458{
459 int old_state;
460
461 old_state = unknown_nmi_panic;
462 proc_dointvec(table, write, file, buffer, length, ppos);
463 if (!!old_state == !!unknown_nmi_panic)
464 return 0;
465
466 if (unknown_nmi_panic) {
467 if (reserve_lapic_nmi() < 0) {
468 unknown_nmi_panic = 0;
469 return -EBUSY;
470 } else {
471 set_nmi_callback(unknown_nmi_panic_callback);
472 }
473 } else {
474 release_lapic_nmi();
475 unset_nmi_callback();
476 }
477 return 0;
478}
479
480#endif
481
482EXPORT_SYMBOL(nmi_active);
483EXPORT_SYMBOL(nmi_watchdog);
484EXPORT_SYMBOL(reserve_lapic_nmi);
485EXPORT_SYMBOL(release_lapic_nmi);
486EXPORT_SYMBOL(disable_timer_nmi_watchdog);
487EXPORT_SYMBOL(enable_timer_nmi_watchdog);
488EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
new file mode 100644
index 000000000000..cab471cf3edb
--- /dev/null
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -0,0 +1,60 @@
1/*
2 * Dynamic DMA mapping support.
3 */
4
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/pci.h>
9#include <linux/module.h>
10#include <asm/io.h>
11
12/* Map a set of buffers described by scatterlist in streaming
 13 * mode for DMA.  This is the scatter-gather version of the
 14 * pci_map_single interface.  Here the scatter-gather list
15 * elements are each tagged with the appropriate dma address
16 * and length. They are obtained via sg_dma_{address,length}(SG).
17 *
18 * NOTE: An implementation may be able to use a smaller number of
19 * DMA address/length pairs than there are SG table elements.
20 * (for example via virtual mapping capabilities)
21 * The routine returns the number of addr/length pairs actually
22 * used, at most nents.
23 *
 24 * Device ownership issues are the same here as for the
 25 * pci_map_single interface.
26 */
27int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
28 int nents, int direction)
29{
30 int i;
31
32 BUG_ON(direction == DMA_NONE);
33 for (i = 0; i < nents; i++ ) {
34 struct scatterlist *s = &sg[i];
35 BUG_ON(!s->page);
36 s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
37 s->dma_length = s->length;
38 }
39 return nents;
40}
41
42EXPORT_SYMBOL(dma_map_sg);
43
44/* Unmap a set of streaming mode DMA translations.
45 * Again, cpu read rules concerning calls here are the same as for
46 * pci_unmap_single() above.
47 */
48void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
49 int nents, int dir)
50{
51 int i;
52 for (i = 0; i < nents; i++) {
53 struct scatterlist *s = &sg[i];
54 BUG_ON(s->page == NULL);
55 BUG_ON(s->dma_address == 0);
56 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
57 }
58}
59
60EXPORT_SYMBOL(dma_unmap_sg);
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
new file mode 100644
index 000000000000..57f35c68aa34
--- /dev/null
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -0,0 +1,980 @@
1/*
2 * Dynamic DMA mapping support for AMD Hammer.
3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
  5 * This makes it possible to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB.
7 *
8 * See Documentation/DMA-mapping.txt for the interface specification.
9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */
12
13#include <linux/config.h>
14#include <linux/types.h>
15#include <linux/ctype.h>
16#include <linux/agp_backend.h>
17#include <linux/init.h>
18#include <linux/mm.h>
19#include <linux/string.h>
20#include <linux/spinlock.h>
21#include <linux/pci.h>
22#include <linux/module.h>
23#include <linux/topology.h>
24#include <linux/interrupt.h>
25#include <linux/bitops.h>
26#include <asm/atomic.h>
27#include <asm/io.h>
28#include <asm/mtrr.h>
29#include <asm/pgtable.h>
30#include <asm/proto.h>
31#include <asm/cacheflush.h>
32#include <asm/kdebug.h>
33
34dma_addr_t bad_dma_address;
35
36unsigned long iommu_bus_base; /* GART remapping area (physical) */
37static unsigned long iommu_size; /* size of remapping area bytes */
38static unsigned long iommu_pages; /* .. and in pages */
39
40u32 *iommu_gatt_base; /* Remapping table */
41
42int no_iommu;
43static int no_agp;
44#ifdef CONFIG_IOMMU_DEBUG
45int panic_on_overflow = 1;
46int force_iommu = 1;
47#else
48int panic_on_overflow = 0;
49int force_iommu = 0;
50#endif
51int iommu_merge = 1;
52int iommu_sac_force = 0;
53
 54/* If this is disabled the IOMMU will use an optimized flushing strategy
 55   of only flushing when a mapping is reused. With it true the GART is flushed
 56   for every mapping. The problem is that the lazy flush seems to trigger
 57   bugs with some popular PCI cards, in particular 3ware (but it has also
 58   been seen with Qlogic at least). */
59int iommu_fullflush = 1;
60
61/* This tells the BIO block layer to assume merging. Default to off
62 because we cannot guarantee merging later. */
63int iommu_bio_merge = 0;
64
65#define MAX_NB 8
66
67/* Allocation bitmap for the remapping area */
68static DEFINE_SPINLOCK(iommu_bitmap_lock);
69static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
70
71static u32 gart_unmapped_entry;
72
73#define GPTE_VALID 1
74#define GPTE_COHERENT 2
75#define GPTE_ENCODE(x) \
76 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
77#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
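/*
 * Worked example: GPTE_ENCODE(0x123456000) == 0x23456000 | (0x1 << 4) | 3
 * == 0x23456013, and GPTE_DECODE(0x23456013) == 0x23456000 | (0x010 << 28)
 * == 0x123456000 again, i.e. address bits 32-39 travel in GPTE bits 4-11.
 */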
78
79#define to_pages(addr,size) \
80 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
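/*
 * e.g. to_pages(0x1234, 0x2000): the in-page offset is 0x234, so
 * 0x234 + 0x2000 rounds up to 0x3000 and the mapping needs 3 pages.
 */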
81
82#define for_all_nb(dev) \
83 dev = NULL; \
84 while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\
85 if (dev->bus->number == 0 && \
86 (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31))
87
88static struct pci_dev *northbridges[MAX_NB];
89static u32 northbridge_flush_word[MAX_NB];
90
91#define EMERGENCY_PAGES 32 /* = 128KB */
92
93#ifdef CONFIG_AGP
94#define AGPEXTERN extern
95#else
96#define AGPEXTERN
97#endif
98
99/* backdoor interface to AGP driver */
100AGPEXTERN int agp_memory_reserved;
101AGPEXTERN __u32 *agp_gatt_table;
102
103static unsigned long next_bit; /* protected by iommu_bitmap_lock */
104static int need_flush; /* global flush state. set for each gart wrap */
105static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
106 size_t size, int dir, int do_panic);
107
108/* Dummy device used for NULL arguments (normally ISA). A smaller DMA mask
109   would probably be better, but this is bug-to-bug compatible with i386. */
110static struct device fallback_dev = {
111 .bus_id = "fallback device",
112 .coherent_dma_mask = 0xffffffff,
113 .dma_mask = &fallback_dev.coherent_dma_mask,
114};
115
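/*
 * Allocate "size" contiguous entries from the GART bitmap.  Allocation
 * starts at next_bit and wraps around to 0 once; a wrap (or
 * iommu_fullflush) marks the GART for flushing before freed entries
 * are handed out again.
 */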
116static unsigned long alloc_iommu(int size)
117{
118 unsigned long offset, flags;
119
120 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
122 if (offset == -1) {
123 need_flush = 1;
124 offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size);
125 }
126 if (offset != -1) {
127 set_bit_string(iommu_gart_bitmap, offset, size);
128 next_bit = offset+size;
129 if (next_bit >= iommu_pages) {
130 next_bit = 0;
131 need_flush = 1;
132 }
133 }
134 if (iommu_fullflush)
135 need_flush = 1;
136 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
137 return offset;
138}
139
140static void free_iommu(unsigned long offset, int size)
141{
142 unsigned long flags;
143 if (size == 1) {
144 clear_bit(offset, iommu_gart_bitmap);
145 return;
146 }
147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
148 __clear_bit_string(iommu_gart_bitmap, offset, size);
149 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
150}
151
152/*
153 * Use global flush state to avoid races with multiple flushers.
154 */
155static void flush_gart(struct device *dev)
156{
157 unsigned long flags;
158 int flushed = 0;
159 int i, max;
160
161 spin_lock_irqsave(&iommu_bitmap_lock, flags);
162 if (need_flush) {
163 max = 0;
164 for (i = 0; i < MAX_NB; i++) {
165 if (!northbridges[i])
166 continue;
167 pci_write_config_dword(northbridges[i], 0x9c,
168 northbridge_flush_word[i] | 1);
169 flushed++;
170 max = i;
171 }
172 for (i = 0; i <= max; i++) {
173 u32 w;
174 if (!northbridges[i])
175 continue;
176 /* Make sure the hardware actually executed the flush. */
177 do {
178 pci_read_config_dword(northbridges[i], 0x9c, &w);
179 } while (w & 1);
180 }
181 if (!flushed)
182 printk("nothing to flush?\n");
183 need_flush = 0;
184 }
185 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
186}
187
188/* Allocate DMA memory on node near device */
189noinline
190static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
191{
192 struct page *page;
193 int node;
194 if (dev->bus == &pci_bus_type) {
195 cpumask_t mask;
196 mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
197 node = cpu_to_node(first_cpu(mask));
198 } else
199 node = numa_node_id();
200 page = alloc_pages_node(node, gfp, order);
201 return page ? page_address(page) : NULL;
202}
203
204/*
205 * Allocate memory for a coherent mapping.
206 */
207void *
208dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
209 unsigned gfp)
210{
211 void *memory;
212 unsigned long dma_mask = 0;
213 u64 bus;
214
215 if (!dev)
216 dev = &fallback_dev;
217 dma_mask = dev->coherent_dma_mask;
218 if (dma_mask == 0)
219 dma_mask = 0xffffffff;
220
221 /* Kludge to make it bug-to-bug compatible with i386. i386
222 uses the normal dma_mask for alloc_coherent. */
223 dma_mask &= *dev->dma_mask;
224
225 again:
226 memory = dma_alloc_pages(dev, gfp, get_order(size));
227 if (memory == NULL)
228 return NULL;
229
230 {
231 int high, mmu;
232 bus = virt_to_bus(memory);
233 high = (bus + size) >= dma_mask;
234 mmu = high;
235 if (force_iommu && !(gfp & GFP_DMA))
236 mmu = 1;
237 if (no_iommu || dma_mask < 0xffffffffUL) {
238 if (high) {
239 free_pages((unsigned long)memory,
240 get_order(size));
241
242 if (swiotlb) {
243 return
244 swiotlb_alloc_coherent(dev, size,
245 dma_handle,
246 gfp);
247 }
248
249 if (!(gfp & GFP_DMA)) {
250 gfp |= GFP_DMA;
251 goto again;
252 }
253 return NULL;
254 }
255 mmu = 0;
256 }
257 memset(memory, 0, size);
258 if (!mmu) {
259 *dma_handle = virt_to_bus(memory);
260 return memory;
261 }
262 }
263
264 *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0);
265 if (*dma_handle == bad_dma_address)
266 goto error;
267 flush_gart(dev);
268 return memory;
269
270error:
271 if (panic_on_overflow)
272 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size);
273 free_pages((unsigned long)memory, get_order(size));
274 return NULL;
275}
276
277/*
278 * Unmap coherent memory.
279 * The caller must ensure that the device has finished accessing the mapping.
280 */
281void dma_free_coherent(struct device *dev, size_t size,
282 void *vaddr, dma_addr_t bus)
283{
284 if (swiotlb) {
285 swiotlb_free_coherent(dev, size, vaddr, bus);
286 return;
287 }
288
289 dma_unmap_single(dev, bus, size, 0);
290 free_pages((unsigned long)vaddr, get_order(size));
291}
292
293#ifdef CONFIG_IOMMU_LEAK
294
295#define SET_LEAK(x) if (iommu_leak_tab) \
296 iommu_leak_tab[x] = __builtin_return_address(0);
297#define CLEAR_LEAK(x) if (iommu_leak_tab) \
298 iommu_leak_tab[x] = NULL;
299
300/* Debugging aid for drivers that don't free their IOMMU tables */
301static void **iommu_leak_tab;
302static int leak_trace;
303int iommu_leak_pages = 20;
304void dump_leak(void)
305{
306 int i;
307 static int dump;
308 if (dump || !iommu_leak_tab) return;
309 dump = 1;
310 show_stack(NULL,NULL);
311 /* Very crude. dump some from the end of the table too */
312 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
313 for (i = 0; i < iommu_leak_pages; i+=2) {
314 printk("%lu: ", iommu_pages-i);
315 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
316 printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
317 }
318 printk("\n");
319}
320#else
321#define SET_LEAK(x)
322#define CLEAR_LEAK(x)
323#endif
324
325static void iommu_full(struct device *dev, size_t size, int dir, int do_panic)
326{
327 /*
328 * Ran out of IOMMU space for this operation. This is very bad.
329 * Unfortunately the drivers cannot handle this operation properly.
330 * Return some non mapped prereserved space in the aperture and
331 * let the Northbridge deal with it. This will result in garbage
332 * in the IO operation. When the size exceeds the prereserved space
333 * memory corruption will occur or random memory will be DMAed
334 * out. Hopefully no network devices use single mappings that big.
335 */
336
337 printk(KERN_ERR
338 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
339 size, dev->bus_id);
340
341 if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) {
342 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
343 panic("PCI-DMA: Memory would be corrupted\n");
344 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
345 panic("PCI-DMA: Random memory would be DMAed\n");
346 }
347
348#ifdef CONFIG_IOMMU_LEAK
349 dump_leak();
350#endif
351}
352
353static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
354{
355 u64 mask = *dev->dma_mask;
356 int high = addr + size >= mask;
357 int mmu = high;
358 if (force_iommu)
359 mmu = 1;
360 if (no_iommu) {
361 if (high)
362 panic("PCI-DMA: high address but no IOMMU.\n");
363 mmu = 0;
364 }
365 return mmu;
366}
367
368static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
369{
370 u64 mask = *dev->dma_mask;
371 int high = addr + size >= mask;
372 int mmu = high;
373 if (no_iommu) {
374 if (high)
375 panic("PCI-DMA: high address but no IOMMU.\n");
376 mmu = 0;
377 }
378 return mmu;
379}
380
381/* Map a single contiguous physical area into the IOMMU.
382 * Caller needs to check if the iommu is needed and flush.
383 */
384static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem,
385 size_t size, int dir, int do_panic)
386{
387 unsigned long npages = to_pages(phys_mem, size);
388 unsigned long iommu_page = alloc_iommu(npages);
389 int i;
390 if (iommu_page == -1) {
391 if (!nonforced_iommu(dev, phys_mem, size))
392 return phys_mem;
393 if (panic_on_overflow)
394 panic("dma_map_area overflow %lu bytes\n", size);
395 iommu_full(dev, size, dir, do_panic);
396 return bad_dma_address;
397 }
398
399 for (i = 0; i < npages; i++) {
400 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
401 SET_LEAK(iommu_page + i);
402 phys_mem += PAGE_SIZE;
403 }
404 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
405}
406
407/* Map a single area into the IOMMU */
408dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir)
409{
410 unsigned long phys_mem, bus;
411
412 BUG_ON(dir == DMA_NONE);
413
414 if (swiotlb)
415 return swiotlb_map_single(dev,addr,size,dir);
416 if (!dev)
417 dev = &fallback_dev;
418
419 phys_mem = virt_to_phys(addr);
420 if (!need_iommu(dev, phys_mem, size))
421 return phys_mem;
422
423 bus = dma_map_area(dev, phys_mem, size, dir, 1);
424 flush_gart(dev);
425 return bus;
426}
427
428/* Fallback for dma_map_sg in case of overflow */
429static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
430 int nents, int dir)
431{
432 int i;
433
434#ifdef CONFIG_IOMMU_DEBUG
435 printk(KERN_DEBUG "dma_map_sg overflow\n");
436#endif
437
438 for (i = 0; i < nents; i++ ) {
439 struct scatterlist *s = &sg[i];
440 unsigned long addr = page_to_phys(s->page) + s->offset;
441 if (nonforced_iommu(dev, addr, s->length)) {
442 addr = dma_map_area(dev, addr, s->length, dir, 0);
443 if (addr == bad_dma_address) {
444 if (i > 0)
445 dma_unmap_sg(dev, sg, i, dir);
446 nents = 0;
447 sg[0].dma_length = 0;
448 break;
449 }
450 }
451 s->dma_address = addr;
452 s->dma_length = s->length;
453 }
454 flush_gart(dev);
455 return nents;
456}
457
458/* Map multiple scatterlist entries contiguously into the first. */
459static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
460 struct scatterlist *sout, unsigned long pages)
461{
462 unsigned long iommu_start = alloc_iommu(pages);
463 unsigned long iommu_page = iommu_start;
464 int i;
465
466 if (iommu_start == -1)
467 return -1;
468
469 for (i = start; i < stopat; i++) {
470 struct scatterlist *s = &sg[i];
471 unsigned long pages, addr;
472 unsigned long phys_addr = s->dma_address;
473
474 BUG_ON(i > start && s->offset);
475 if (i == start) {
476 *sout = *s;
477 sout->dma_address = iommu_bus_base;
478 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
479 sout->dma_length = s->length;
480 } else {
481 sout->dma_length += s->length;
482 }
483
484 addr = phys_addr;
485 pages = to_pages(s->offset, s->length);
486 while (pages--) {
487 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
488 SET_LEAK(iommu_page);
489 addr += PAGE_SIZE;
490 iommu_page++;
491 }
492 }
493 BUG_ON(iommu_page - iommu_start != pages);
494 return 0;
495}
496
497static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
498 struct scatterlist *sout,
499 unsigned long pages, int need)
500{
501 if (!need) {
502 BUG_ON(stopat - start != 1);
503 *sout = sg[start];
504 sout->dma_length = sg[start].length;
505 return 0;
506 }
507 return __dma_map_cont(sg, start, stopat, sout, pages);
508}
509
510/*
511 * DMA map all entries in a scatterlist.
 512 * Merge chunks that have page-aligned sizes into a contiguous mapping.
513 */
514int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
515{
516 int i;
517 int out;
518 int start;
519 unsigned long pages = 0;
520 int need = 0, nextneed;
521
522 BUG_ON(dir == DMA_NONE);
523 if (nents == 0)
524 return 0;
525
526 if (swiotlb)
527 return swiotlb_map_sg(dev,sg,nents,dir);
528 if (!dev)
529 dev = &fallback_dev;
530
531 out = 0;
532 start = 0;
533 for (i = 0; i < nents; i++) {
534 struct scatterlist *s = &sg[i];
535 dma_addr_t addr = page_to_phys(s->page) + s->offset;
536 s->dma_address = addr;
537 BUG_ON(s->length == 0);
538
539 nextneed = need_iommu(dev, addr, s->length);
540
541 /* Handle the previous not yet processed entries */
542 if (i > start) {
543 struct scatterlist *ps = &sg[i-1];
544 /* Can only merge when the last chunk ends on a page
545 boundary and the new one doesn't have an offset. */
546 if (!iommu_merge || !nextneed || !need || s->offset ||
547 (ps->offset + ps->length) % PAGE_SIZE) {
548 if (dma_map_cont(sg, start, i, sg+out, pages,
549 need) < 0)
550 goto error;
551 out++;
552 pages = 0;
553 start = i;
554 }
555 }
556
557 need = nextneed;
558 pages += to_pages(s->offset, s->length);
559 }
560 if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
561 goto error;
562 out++;
563 flush_gart(dev);
564 if (out < nents)
565 sg[out].dma_length = 0;
566 return out;
567
568error:
569 flush_gart(NULL);
570 dma_unmap_sg(dev, sg, nents, dir);
571 /* When it was forced try again unforced */
572 if (force_iommu)
573 return dma_map_sg_nonforce(dev, sg, nents, dir);
574 if (panic_on_overflow)
575 panic("dma_map_sg: overflow on %lu pages\n", pages);
576 iommu_full(dev, pages << PAGE_SHIFT, dir, 0);
577 for (i = 0; i < nents; i++)
578 sg[i].dma_address = bad_dma_address;
579 return 0;
580}
581
582/*
583 * Free a DMA mapping.
584 */
585void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
586 size_t size, int direction)
587{
588 unsigned long iommu_page;
589 int npages;
590 int i;
591
592 if (swiotlb) {
593 swiotlb_unmap_single(dev,dma_addr,size,direction);
594 return;
595 }
596
597 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
598 dma_addr >= iommu_bus_base + iommu_size)
599 return;
600 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
601 npages = to_pages(dma_addr, size);
602 for (i = 0; i < npages; i++) {
603 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
604 CLEAR_LEAK(iommu_page + i);
605 }
606 free_iommu(iommu_page, npages);
607}
608
609/*
610 * Wrapper for pci_unmap_single working with scatterlists.
611 */
612void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
613{
614 int i;
615 if (swiotlb) {
616 swiotlb_unmap_sg(dev,sg,nents,dir);
617 return;
618 }
619 for (i = 0; i < nents; i++) {
620 struct scatterlist *s = &sg[i];
621 if (!s->dma_length || !s->length)
622 break;
623 dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
624 }
625}
626
627int dma_supported(struct device *dev, u64 mask)
628{
629 /* Copied from i386. Doesn't make much sense, because it will
630 only work for pci_alloc_coherent.
631 The caller just has to use GFP_DMA in this case. */
632 if (mask < 0x00ffffff)
633 return 0;
634
635 /* Tell the device to use SAC when IOMMU force is on.
636 This allows the driver to use cheaper accesses in some cases.
637
638 Problem with this is that if we overflow the IOMMU area
639 and return DAC as fallback address the device may not handle it correctly.
640
641 As a special case some controllers have a 39bit address mode
642 that is as efficient as 32bit (aic79xx). Don't force SAC for these.
643 Assume all masks <= 40 bits are of this type. Normally this doesn't
644 make any difference, but gives more gentle handling of IOMMU overflow. */
645 if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
646 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
647 return 0;
648 }
649
650 return 1;
651}
652
653int dma_get_cache_alignment(void)
654{
655 return boot_cpu_data.x86_clflush_size;
656}
657
658EXPORT_SYMBOL(dma_unmap_sg);
659EXPORT_SYMBOL(dma_map_sg);
660EXPORT_SYMBOL(dma_map_single);
661EXPORT_SYMBOL(dma_unmap_single);
662EXPORT_SYMBOL(dma_supported);
663EXPORT_SYMBOL(no_iommu);
664EXPORT_SYMBOL(force_iommu);
665EXPORT_SYMBOL(bad_dma_address);
666EXPORT_SYMBOL(iommu_bio_merge);
667EXPORT_SYMBOL(iommu_sac_force);
668EXPORT_SYMBOL(dma_get_cache_alignment);
669EXPORT_SYMBOL(dma_alloc_coherent);
670EXPORT_SYMBOL(dma_free_coherent);
671
672static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
673{
674 unsigned long a;
675 if (!iommu_size) {
676 iommu_size = aper_size;
677 if (!no_agp)
678 iommu_size /= 2;
679 }
680
681 a = aper + iommu_size;
682 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
683
684 if (iommu_size < 64*1024*1024)
685 printk(KERN_WARNING
686 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
687
688 return iommu_size;
689}
690
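/*
 * Decode the K8 northbridge aperture registers.  As an illustration,
 * an order field of 3 in config reg 0x90 means a 32MB << 3 == 256MB
 * aperture, and a base field of 0x0040 in reg 0x94 means a physical
 * base of 0x0040 << 25 == 0x80000000.
 */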
691static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
692{
693 unsigned aper_size = 0, aper_base_32;
694 u64 aper_base;
695 unsigned aper_order;
696
697 pci_read_config_dword(dev, 0x94, &aper_base_32);
698 pci_read_config_dword(dev, 0x90, &aper_order);
699 aper_order = (aper_order >> 1) & 7;
700
701 aper_base = aper_base_32 & 0x7fff;
702 aper_base <<= 25;
703
704 aper_size = (32 * 1024 * 1024) << aper_order;
705 if (aper_base + aper_size >= 0xffffffff || !aper_size)
706 aper_base = 0;
707
708 *size = aper_size;
709 return aper_base;
710}
711
712/*
713 * Private Northbridge GATT initialization in case we cannot use the
714 * AGP driver for some reason.
715 */
716static __init int init_k8_gatt(struct agp_kern_info *info)
717{
718 struct pci_dev *dev;
719 void *gatt;
720 unsigned aper_base, new_aper_base;
721 unsigned aper_size, gatt_size, new_aper_size;
722
723 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
724 aper_size = aper_base = info->aper_size = 0;
725 for_all_nb(dev) {
726 new_aper_base = read_aperture(dev, &new_aper_size);
727 if (!new_aper_base)
728 goto nommu;
729
730 if (!aper_base) {
731 aper_size = new_aper_size;
732 aper_base = new_aper_base;
733 }
734 if (aper_size != new_aper_size || aper_base != new_aper_base)
735 goto nommu;
736 }
737 if (!aper_base)
738 goto nommu;
739 info->aper_base = aper_base;
740 info->aper_size = aper_size>>20;
741
742 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
743 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
744 if (!gatt)
745 panic("Cannot allocate GATT table");
746 memset(gatt, 0, gatt_size);
747 agp_gatt_table = gatt;
748
749 for_all_nb(dev) {
750 u32 ctl;
751 u32 gatt_reg;
752
753 gatt_reg = __pa(gatt) >> 12;
754 gatt_reg <<= 4;
755 pci_write_config_dword(dev, 0x98, gatt_reg);
756 pci_read_config_dword(dev, 0x90, &ctl);
757
758 ctl |= 1;
759 ctl &= ~((1<<4) | (1<<5));
760
761 pci_write_config_dword(dev, 0x90, ctl);
762 }
763 flush_gart(NULL);
764
765 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
766 return 0;
767
768 nommu:
769 /* Should not happen anymore */
770 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
771 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.");
772 return -1;
773}
774
775extern int agp_amd64_init(void);
776
777static int __init pci_iommu_init(void)
778{
779 struct agp_kern_info info;
780 unsigned long aper_size;
781 unsigned long iommu_start;
782 struct pci_dev *dev;
783 unsigned long scratch;
784 long i;
785
786#ifndef CONFIG_AGP_AMD64
787 no_agp = 1;
788#else
789 /* Makefile puts PCI initialization via subsys_initcall first. */
790 /* Add other K8 AGP bridge drivers here */
791 no_agp = no_agp ||
792 (agp_amd64_init() < 0) ||
793 (agp_copy_info(agp_bridge, &info) < 0);
794#endif
795
796 if (swiotlb) {
797 no_iommu = 1;
798 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
799 return -1;
800 }
801
802 if (no_iommu ||
803 (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) ||
804 !iommu_aperture ||
805 (no_agp && init_k8_gatt(&info) < 0)) {
806 printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
807 no_iommu = 1;
808 return -1;
809 }
810
811 aper_size = info.aper_size * 1024 * 1024;
812 iommu_size = check_iommu_size(info.aper_base, aper_size);
813 iommu_pages = iommu_size >> PAGE_SHIFT;
814
815 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
816 get_order(iommu_pages/8));
817 if (!iommu_gart_bitmap)
818 panic("Cannot allocate iommu bitmap\n");
819 memset(iommu_gart_bitmap, 0, iommu_pages/8);
820
821#ifdef CONFIG_IOMMU_LEAK
822 if (leak_trace) {
823 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
824 get_order(iommu_pages*sizeof(void *)));
825 if (iommu_leak_tab)
826 memset(iommu_leak_tab, 0, iommu_pages * 8);
827 else
828 printk("PCI-DMA: Cannot allocate leak trace area\n");
829 }
830#endif
831
832 /*
833 * Out of IOMMU space handling.
834 * Reserve some invalid pages at the beginning of the GART.
835 */
836 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
837
838 agp_memory_reserved = iommu_size;
839 printk(KERN_INFO
840 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
841 iommu_size>>20);
842
843 iommu_start = aper_size - iommu_size;
844 iommu_bus_base = info.aper_base + iommu_start;
845 bad_dma_address = iommu_bus_base;
846 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
847
848 /*
849 * Unmap the IOMMU part of the GART. The alias of the page is
850 * always mapped with cache enabled and there is no full cache
851 * coherency across the GART remapping. The unmapping avoids
852 * automatic prefetches from the CPU allocating cache lines in
853 * there. All CPU accesses are done via the direct mapping to
854 * the backing memory. The GART address is only used by PCI
855 * devices.
856 */
857 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
858
859 /*
 860	 * Try to work around a bug (thanks to BenH):
 861	 * Set unmapped entries to a scratch page instead of 0.
 862	 * Any prefetches that hit unmapped entries won't get a bus abort
863 * then.
864 */
865 scratch = get_zeroed_page(GFP_KERNEL);
866 if (!scratch)
867 panic("Cannot allocate iommu scratch page");
868 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
869 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
870 iommu_gatt_base[i] = gart_unmapped_entry;
871
872 for_all_nb(dev) {
873 u32 flag;
874 int cpu = PCI_SLOT(dev->devfn) - 24;
875 if (cpu >= MAX_NB)
876 continue;
877 northbridges[cpu] = dev;
878 pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
879 northbridge_flush_word[cpu] = flag;
880 }
881
882 flush_gart(NULL);
883
884 return 0;
885}
886
887/* Must execute after PCI subsystem */
888fs_initcall(pci_iommu_init);
889
890/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
891 [,forcesac][,fullflush][,nomerge][,biomerge]
892 size set size of iommu (in bytes)
893 noagp don't initialize the AGP driver and use full aperture.
894 off don't use the IOMMU
895 leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
 896   memaper[=order] allocate a private aperture over RAM of size 32MB << order.
897 noforce don't force IOMMU usage. Default.
898 force Force IOMMU.
899 merge Do lazy merging. This may improve performance on some block devices.
900 Implies force (experimental)
901 biomerge Do merging at the BIO layer. This is more efficient than merge,
902 but should only be done with very big IOMMUs. Implies merge,force.
903 nomerge Don't do SG merging.
904 forcesac Force SAC mode for masks <40bits (experimental)
905 fullflush Flush IOMMU on each allocation (default)
906 nofullflush Don't use IOMMU fullflush
907 allowed Override iommu-off workarounds for specific chipsets.
908 soft Use software bounce buffering (default for Intel machines)
909 noaperture Don't touch the aperture for AGP.
910*/
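/* Illustrative usage sketch (not in the original source; the boot line
   below is hypothetical): a command line such as

       iommu=force,memaper=2,nofullflush,65536

   would be handled by iommu_setup() below roughly as follows: "force"
   sets force_iommu and iommu_aperture_allowed, "memaper=2" requests a
   fallback aperture over RAM of 32MB<<2 = 128MB, "nofullflush" clears
   iommu_fullflush, and the trailing number sets iommu_size to 65536
   bytes. */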
911__init int iommu_setup(char *p)
912{
913 int arg;
914
915 while (*p) {
916 if (!strncmp(p,"noagp",5))
917 no_agp = 1;
918 if (!strncmp(p,"off",3))
919 no_iommu = 1;
920 if (!strncmp(p,"force",5)) {
921 force_iommu = 1;
922 iommu_aperture_allowed = 1;
923 }
924 if (!strncmp(p,"allowed",7))
925 iommu_aperture_allowed = 1;
926 if (!strncmp(p,"noforce",7)) {
927 iommu_merge = 0;
928 force_iommu = 0;
929 }
930 if (!strncmp(p, "memaper", 7)) {
931 fallback_aper_force = 1;
932 p += 7;
933 if (*p == '=') {
934 ++p;
935 if (get_option(&p, &arg))
936 fallback_aper_order = arg;
937 }
938 }
939 if (!strncmp(p, "biomerge",8)) {
940 iommu_bio_merge = 4096;
941 iommu_merge = 1;
942 force_iommu = 1;
943 }
944 if (!strncmp(p, "panic",5))
945 panic_on_overflow = 1;
946 if (!strncmp(p, "nopanic",7))
947 panic_on_overflow = 0;
948 if (!strncmp(p, "merge",5)) {
949 iommu_merge = 1;
950 force_iommu = 1;
951 }
952 if (!strncmp(p, "nomerge",7))
953 iommu_merge = 0;
954 if (!strncmp(p, "forcesac",8))
955 iommu_sac_force = 1;
956 if (!strncmp(p, "fullflush",9))
957 iommu_fullflush = 1;
958 if (!strncmp(p, "nofullflush",11))
959 iommu_fullflush = 0;
960 if (!strncmp(p, "soft",4))
961 swiotlb = 1;
962 if (!strncmp(p, "noaperture",10))
963 fix_aperture = 0;
964#ifdef CONFIG_IOMMU_LEAK
965 if (!strncmp(p,"leak",4)) {
966 leak_trace = 1;
967 p += 4;
968 if (*p == '=') ++p;
969 if (isdigit(*p) && get_option(&p, &arg))
970 iommu_leak_pages = arg;
971 } else
972#endif
973 if (isdigit(*p) && get_option(&p, &arg))
974 iommu_size = arg;
975 p += strcspn(p, ",");
976 if (*p == ',')
977 ++p;
978 }
979 return 1;
980}
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
new file mode 100644
index 000000000000..67d90b89af0b
--- /dev/null
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -0,0 +1,94 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <asm/proto.h>
8#include <asm/processor.h>
9
10int iommu_merge = 0;
11EXPORT_SYMBOL(iommu_merge);
12
13dma_addr_t bad_dma_address;
14EXPORT_SYMBOL(bad_dma_address);
15
16int iommu_bio_merge = 0;
17EXPORT_SYMBOL(iommu_bio_merge);
18
19int iommu_sac_force = 0;
20EXPORT_SYMBOL(iommu_sac_force);
21
22/*
23 * Dummy IO MMU functions
24 */
25
26void *dma_alloc_coherent(struct device *hwdev, size_t size,
27 dma_addr_t *dma_handle, unsigned gfp)
28{
29 void *ret;
30 u64 mask;
31 int order = get_order(size);
32
33 if (hwdev)
34 mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
35 else
36 mask = 0xffffffff;
37 for (;;) {
38 ret = (void *)__get_free_pages(gfp, order);
39 if (ret == NULL)
40 return NULL;
41 *dma_handle = virt_to_bus(ret);
42 if ((*dma_handle & ~mask) == 0)
43 break;
44 free_pages((unsigned long)ret, order);
45 if (gfp & GFP_DMA)
46 return NULL;
47 gfp |= GFP_DMA;
48 }
49
50 memset(ret, 0, size);
51 return ret;
52}
53EXPORT_SYMBOL(dma_alloc_coherent);
54
55void dma_free_coherent(struct device *hwdev, size_t size,
56 void *vaddr, dma_addr_t dma_handle)
57{
58 free_pages((unsigned long)vaddr, get_order(size));
59}
60EXPORT_SYMBOL(dma_free_coherent);
61
62int dma_supported(struct device *hwdev, u64 mask)
63{
64 /*
65 * we fall back to GFP_DMA when the mask isn't all 1s,
66 * so we can't guarantee allocations that must be
67 * within a tighter range than GFP_DMA..
68 * RED-PEN this won't work for pci_map_single. Caller has to
69 * use GFP_DMA in the first place.
70 */
71 if (mask < 0x00ffffff)
72 return 0;
73
74 return 1;
75}
76EXPORT_SYMBOL(dma_supported);
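/* Side note (an assumption: ZONE_DMA covers the first 16MB, as on i386):
   the 0x00ffffff test above says "any mask that covers at least the
   16MB ISA DMA zone can be satisfied", because dma_alloc_coherent()
   retries with GFP_DMA.  A device with a 24-bit mask (0x00ffffff) is
   therefore supported, while a hypothetical device with a 20-bit mask
   (0x000fffff) is not, since even GFP_DMA memory may lie above 1MB. */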
77
78int dma_get_cache_alignment(void)
79{
80 return boot_cpu_data.x86_clflush_size;
81}
82EXPORT_SYMBOL(dma_get_cache_alignment);
83
84static int __init check_ram(void)
85{
86 if (end_pfn >= 0xffffffff>>PAGE_SHIFT) {
87 printk(
88 KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
89 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
90 }
91 return 0;
92}
93__initcall(check_ram);
94
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
new file mode 100644
index 000000000000..9922d2ba24a3
--- /dev/null
+++ b/arch/x86_64/kernel/process.c
@@ -0,0 +1,770 @@
1/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
11 *
12 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
13 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
21#include <linux/errno.h>
22#include <linux/sched.h>
23#include <linux/kernel.h>
24#include <linux/mm.h>
25#include <linux/elfcore.h>
26#include <linux/smp.h>
27#include <linux/slab.h>
28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h>
31#include <linux/interrupt.h>
32#include <linux/delay.h>
33#include <linux/irq.h>
34#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
37
38#include <asm/uaccess.h>
39#include <asm/pgtable.h>
40#include <asm/system.h>
41#include <asm/io.h>
42#include <asm/processor.h>
43#include <asm/i387.h>
44#include <asm/mmu_context.h>
45#include <asm/pda.h>
46#include <asm/prctl.h>
47#include <asm/kdebug.h>
48#include <asm/desc.h>
49#include <asm/proto.h>
50#include <asm/ia32.h>
51
52asmlinkage extern void ret_from_fork(void);
53
54unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
55
56static atomic_t hlt_counter = ATOMIC_INIT(0);
57
58unsigned long boot_option_idle_override = 0;
59EXPORT_SYMBOL(boot_option_idle_override);
60
61/*
62 * Power management idle function, if any.
63 */
64void (*pm_idle)(void);
65static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
66
67void disable_hlt(void)
68{
69 atomic_inc(&hlt_counter);
70}
71
72EXPORT_SYMBOL(disable_hlt);
73
74void enable_hlt(void)
75{
76 atomic_dec(&hlt_counter);
77}
78
79EXPORT_SYMBOL(enable_hlt);
80
81/*
82 * We use this if we don't have any better
83 * idle routine..
84 */
85void default_idle(void)
86{
87 if (!atomic_read(&hlt_counter)) {
88 local_irq_disable();
89 if (!need_resched())
90 safe_halt();
91 else
92 local_irq_enable();
93 }
94}
95
96/*
97 * On SMP it's slightly faster (but much more power-consuming!)
98 * to poll the ->need_resched flag instead of waiting for the
99 * cross-CPU IPI to arrive. Use this option with caution.
100 */
101static void poll_idle (void)
102{
103 int oldval;
104
105 local_irq_enable();
106
107 /*
108 * Deal with another CPU just having chosen a thread to
109 * run here:
110 */
111 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
112
113 if (!oldval) {
114 set_thread_flag(TIF_POLLING_NRFLAG);
115 asm volatile(
116 "2:"
117 "testl %0,%1;"
118 "rep; nop;"
119 "je 2b;"
120 : :
121 "i" (_TIF_NEED_RESCHED),
122 "m" (current_thread_info()->flags));
123 } else {
124 set_need_resched();
125 }
126}
127
128void cpu_idle_wait(void)
129{
130 unsigned int cpu, this_cpu = get_cpu();
131 cpumask_t map;
132
133 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
134 put_cpu();
135
136 cpus_clear(map);
137 for_each_online_cpu(cpu) {
138 per_cpu(cpu_idle_state, cpu) = 1;
139 cpu_set(cpu, map);
140 }
141
142 __get_cpu_var(cpu_idle_state) = 0;
143
144 wmb();
145 do {
146 ssleep(1);
147 for_each_online_cpu(cpu) {
148 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
149 cpu_clear(cpu, map);
150 }
151 cpus_and(map, map, cpu_online_map);
152 } while (!cpus_empty(map));
153}
154EXPORT_SYMBOL_GPL(cpu_idle_wait);
155
156/*
157 * The idle thread. There's no useful work to be
158 * done, so just try to conserve power and have a
159 * low exit latency (ie sit in a loop waiting for
160 * somebody to say that they'd like to reschedule)
161 */
162void cpu_idle (void)
163{
164 /* endless idle loop with no priority at all */
165 while (1) {
166 while (!need_resched()) {
167 void (*idle)(void);
168
169 if (__get_cpu_var(cpu_idle_state))
170 __get_cpu_var(cpu_idle_state) = 0;
171
172 rmb();
173 idle = pm_idle;
174 if (!idle)
175 idle = default_idle;
176 idle();
177 }
178
179 schedule();
180 }
181}
182
183/*
184 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
185 * which can obviate the IPI otherwise needed to trigger a need_resched check.
186 * We execute MONITOR against need_resched and enter an optimized wait state
187 * through MWAIT. Whenever someone changes need_resched, we are woken
188 * up from MWAIT (without an IPI).
189 */
190static void mwait_idle(void)
191{
192 local_irq_enable();
193
194 if (!need_resched()) {
195 set_thread_flag(TIF_POLLING_NRFLAG);
196 do {
197 __monitor((void *)&current_thread_info()->flags, 0, 0);
198 if (need_resched())
199 break;
200 __mwait(0, 0);
201 } while (!need_resched());
202 clear_thread_flag(TIF_POLLING_NRFLAG);
203 }
204}
205
206void __init select_idle_routine(const struct cpuinfo_x86 *c)
207{
208 static int printed;
209 if (cpu_has(c, X86_FEATURE_MWAIT)) {
210 /*
211 * Skip, if setup has overridden idle.
212 * One CPU supports mwait => all CPUs support mwait
213 */
214 if (!pm_idle) {
215 if (!printed) {
216 printk("using mwait in idle threads.\n");
217 printed = 1;
218 }
219 pm_idle = mwait_idle;
220 }
221 }
222}
223
224static int __init idle_setup (char *str)
225{
226 if (!strncmp(str, "poll", 4)) {
227 printk("using polling idle threads.\n");
228 pm_idle = poll_idle;
229 }
230
231 boot_option_idle_override = 1;
232 return 1;
233}
234
235__setup("idle=", idle_setup);
236
237/* Also prints some state that isn't saved in the pt_regs */
238void __show_regs(struct pt_regs * regs)
239{
240 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
241 unsigned int fsindex,gsindex;
242 unsigned int ds,cs,es;
243
244 printk("\n");
245 print_modules();
246 printk("Pid: %d, comm: %.20s %s %s\n",
247 current->pid, current->comm, print_tainted(), system_utsname.release);
248 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
249 printk_address(regs->rip);
250 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
251 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
252 regs->rax, regs->rbx, regs->rcx);
253 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
254 regs->rdx, regs->rsi, regs->rdi);
255 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
256 regs->rbp, regs->r8, regs->r9);
257 printk("R10: %016lx R11: %016lx R12: %016lx\n",
258 regs->r10, regs->r11, regs->r12);
259 printk("R13: %016lx R14: %016lx R15: %016lx\n",
260 regs->r13, regs->r14, regs->r15);
261
262 asm("movl %%ds,%0" : "=r" (ds));
263 asm("movl %%cs,%0" : "=r" (cs));
264 asm("movl %%es,%0" : "=r" (es));
265 asm("movl %%fs,%0" : "=r" (fsindex));
266 asm("movl %%gs,%0" : "=r" (gsindex));
267
268 rdmsrl(MSR_FS_BASE, fs);
269 rdmsrl(MSR_GS_BASE, gs);
270 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
271
272 asm("movq %%cr0, %0": "=r" (cr0));
273 asm("movq %%cr2, %0": "=r" (cr2));
274 asm("movq %%cr3, %0": "=r" (cr3));
275 asm("movq %%cr4, %0": "=r" (cr4));
276
277 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
278 fs,fsindex,gs,gsindex,shadowgs);
279 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
280 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
281}
282
283void show_regs(struct pt_regs *regs)
284{
285 __show_regs(regs);
286 show_trace(&regs->rsp);
287}
288
289/*
290 * Free current thread data structures etc..
291 */
292void exit_thread(void)
293{
294 struct task_struct *me = current;
295 struct thread_struct *t = &me->thread;
296 if (me->thread.io_bitmap_ptr) {
297 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
298
299 kfree(t->io_bitmap_ptr);
300 t->io_bitmap_ptr = NULL;
301 /*
302 * Careful, clear this in the TSS too:
303 */
304 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
305 t->io_bitmap_max = 0;
306 put_cpu();
307 }
308}
309
310void flush_thread(void)
311{
312 struct task_struct *tsk = current;
313 struct thread_info *t = current_thread_info();
314
315 if (t->flags & _TIF_ABI_PENDING)
316 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
317
318 tsk->thread.debugreg0 = 0;
319 tsk->thread.debugreg1 = 0;
320 tsk->thread.debugreg2 = 0;
321 tsk->thread.debugreg3 = 0;
322 tsk->thread.debugreg6 = 0;
323 tsk->thread.debugreg7 = 0;
324 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
325 /*
326 * Forget coprocessor state..
327 */
328 clear_fpu(tsk);
329 clear_used_math();
330}
331
332void release_thread(struct task_struct *dead_task)
333{
334 if (dead_task->mm) {
335 if (dead_task->mm->context.size) {
336 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
337 dead_task->comm,
338 dead_task->mm->context.ldt,
339 dead_task->mm->context.size);
340 BUG();
341 }
342 }
343}
344
345static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
346{
347 struct user_desc ud = {
348 .base_addr = addr,
349 .limit = 0xfffff,
350 .seg_32bit = 1,
351 .limit_in_pages = 1,
352 .useable = 1,
353 };
354 struct n_desc_struct *desc = (void *)t->thread.tls_array;
355 desc += tls;
356 desc->a = LDT_entry_a(&ud);
357 desc->b = LDT_entry_b(&ud);
358}
359
360static inline u32 read_32bit_tls(struct task_struct *t, int tls)
361{
362 struct desc_struct *desc = (void *)t->thread.tls_array;
363 desc += tls;
364 return desc->base0 |
365 (((u32)desc->base1) << 16) |
366 (((u32)desc->base2) << 24);
367}
368
369/*
370 * This gets called before we allocate a new thread and copy
371 * the current task into it.
372 */
373void prepare_to_copy(struct task_struct *tsk)
374{
375 unlazy_fpu(tsk);
376}
377
378int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
379 unsigned long unused,
380 struct task_struct * p, struct pt_regs * regs)
381{
382 int err;
383 struct pt_regs * childregs;
384 struct task_struct *me = current;
385
386 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
387
388 *childregs = *regs;
389
390 childregs->rax = 0;
391 childregs->rsp = rsp;
392 if (rsp == ~0UL) {
393 childregs->rsp = (unsigned long)childregs;
394 }
395
396 p->thread.rsp = (unsigned long) childregs;
397 p->thread.rsp0 = (unsigned long) (childregs+1);
398 p->thread.userrsp = me->thread.userrsp;
399
400 set_ti_thread_flag(p->thread_info, TIF_FORK);
401
402 p->thread.fs = me->thread.fs;
403 p->thread.gs = me->thread.gs;
404
405 asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
406 asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
407 asm("movl %%es,%0" : "=m" (p->thread.es));
408 asm("movl %%ds,%0" : "=m" (p->thread.ds));
409
410 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
411 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
412 if (!p->thread.io_bitmap_ptr) {
413 p->thread.io_bitmap_max = 0;
414 return -ENOMEM;
415 }
416 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
417 }
418
419 /*
420 * Set a new TLS for the child thread?
421 */
422 if (clone_flags & CLONE_SETTLS) {
423#ifdef CONFIG_IA32_EMULATION
424 if (test_thread_flag(TIF_IA32))
425 err = ia32_child_tls(p, childregs);
426 else
427#endif
428 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
429 if (err)
430 goto out;
431 }
432 err = 0;
433out:
434 if (err && p->thread.io_bitmap_ptr) {
435 kfree(p->thread.io_bitmap_ptr);
436 p->thread.io_bitmap_max = 0;
437 }
438 return err;
439}
440
441/*
442 * This special macro can be used to load a debugging register
443 */
444#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
445
446/*
447 * switch_to(x,y) should switch tasks from x to y.
448 *
449 * This could still be optimized:
450 * - fold all the options into a flag word and test it with a single test.
451 * - could test fs/gs bitsliced
452 */
453struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
454{
455 struct thread_struct *prev = &prev_p->thread,
456 *next = &next_p->thread;
457 int cpu = smp_processor_id();
458 struct tss_struct *tss = &per_cpu(init_tss, cpu);
459
460 unlazy_fpu(prev_p);
461
462 /*
463 * Reload esp0, LDT and the page table pointer:
464 */
465 tss->rsp0 = next->rsp0;
466
467 /*
468 * Switch DS and ES.
469 * This won't pick up thread selector changes, but I guess that is ok.
470 */
471 asm volatile("movl %%es,%0" : "=m" (prev->es));
472 if (unlikely(next->es | prev->es))
473 loadsegment(es, next->es);
474
475 asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
476 if (unlikely(next->ds | prev->ds))
477 loadsegment(ds, next->ds);
478
479 load_TLS(next, cpu);
480
481 /*
482 * Switch FS and GS.
483 */
484 {
485 unsigned fsindex;
486 asm volatile("movl %%fs,%0" : "=r" (fsindex));
487 /* A segment register != 0 always requires a reload.
488 Also reload when it has changed.
489 When the previous process used a 64bit base, always reload
490 to avoid an information leak. */
491 if (unlikely(fsindex | next->fsindex | prev->fs)) {
492 loadsegment(fs, next->fsindex);
493 /* check if the user used a selector != 0;
494 * if so, clear the 64bit base, since the overloaded base
495 * is always mapped to the Null selector
496 */
497 if (fsindex)
498 prev->fs = 0;
499 }
500 /* when next process has a 64bit base use it */
501 if (next->fs)
502 wrmsrl(MSR_FS_BASE, next->fs);
503 prev->fsindex = fsindex;
504 }
505 {
506 unsigned gsindex;
507 asm volatile("movl %%gs,%0" : "=r" (gsindex));
508 if (unlikely(gsindex | next->gsindex | prev->gs)) {
509 load_gs_index(next->gsindex);
510 if (gsindex)
511 prev->gs = 0;
512 }
513 if (next->gs)
514 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
515 prev->gsindex = gsindex;
516 }
517
518 /*
519 * Switch the PDA context.
520 */
521 prev->userrsp = read_pda(oldrsp);
522 write_pda(oldrsp, next->userrsp);
523 write_pda(pcurrent, next_p);
524 write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
525
526 /*
527 * Now maybe reload the debug registers
528 */
529 if (unlikely(next->debugreg7)) {
530 loaddebug(next, 0);
531 loaddebug(next, 1);
532 loaddebug(next, 2);
533 loaddebug(next, 3);
534 /* no 4 and 5 */
535 loaddebug(next, 6);
536 loaddebug(next, 7);
537 }
538
539
540 /*
541 * Handle the IO bitmap
542 */
543 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
544 if (next->io_bitmap_ptr)
545 /*
546 * Copy the relevant range of the IO bitmap.
547 * Normally this is 128 bytes or less:
548 */
549 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
550 max(prev->io_bitmap_max, next->io_bitmap_max));
551 else {
552 /*
553 * Clear any possible leftover bits:
554 */
555 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
556 }
557 }
558
559 return prev_p;
560}
561
562/*
563 * sys_execve() executes a new program.
564 */
565asmlinkage
566long sys_execve(char __user *name, char __user * __user *argv,
567 char __user * __user *envp, struct pt_regs regs)
568{
569 long error;
570 char * filename;
571
572 filename = getname(name);
573 error = PTR_ERR(filename);
574 if (IS_ERR(filename))
575 return error;
576 error = do_execve(filename, argv, envp, &regs);
577 if (error == 0) {
578 task_lock(current);
579 current->ptrace &= ~PT_DTRACE;
580 task_unlock(current);
581 }
582 putname(filename);
583 return error;
584}
585
586void set_personality_64bit(void)
587{
588 /* inherit personality from parent */
589
590 /* Make sure to be in 64bit mode */
591 clear_thread_flag(TIF_IA32);
592
593 /* TBD: overwrites user setup. Should have two bits.
594 But 64bit processes have always behaved this way,
595 so it's not too bad. The main problem is just that
596 32bit children are affected again. */
597 current->personality &= ~READ_IMPLIES_EXEC;
598}
599
600asmlinkage long sys_fork(struct pt_regs *regs)
601{
602 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
603}
604
605asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
606{
607 if (!newsp)
608 newsp = regs->rsp;
609 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
610}
611
612/*
613 * This is trivial, and on the face of it looks like it
614 * could equally well be done in user mode.
615 *
616 * Not so, for quite unobvious reasons - register pressure.
617 * In user mode vfork() cannot have a stack frame, and if
618 * done by calling the "clone()" system call directly, you
619 * do not have enough call-clobbered registers to hold all
620 * the information you need.
621 */
622asmlinkage long sys_vfork(struct pt_regs *regs)
623{
624 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
625 NULL, NULL);
626}
627
628unsigned long get_wchan(struct task_struct *p)
629{
630 unsigned long stack;
631 u64 fp,rip;
632 int count = 0;
633
634 if (!p || p == current || p->state==TASK_RUNNING)
635 return 0;
636 stack = (unsigned long)p->thread_info;
637 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
638 return 0;
639 fp = *(u64 *)(p->thread.rsp);
640 do {
641 if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
642 return 0;
643 rip = *(u64 *)(fp+8);
644 if (!in_sched_functions(rip))
645 return rip;
646 fp = *(u64 *)fp;
647 } while (count++ < 16);
648 return 0;
649}
650
651long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
652{
653 int ret = 0;
654 int doit = task == current;
655 int cpu;
656
657 switch (code) {
658 case ARCH_SET_GS:
659 if (addr >= TASK_SIZE)
660 return -EPERM;
661 cpu = get_cpu();
662 /* handle small bases via the GDT because that's faster to
663 switch. */
664 if (addr <= 0xffffffff) {
665 set_32bit_tls(task, GS_TLS, addr);
666 if (doit) {
667 load_TLS(&task->thread, cpu);
668 load_gs_index(GS_TLS_SEL);
669 }
670 task->thread.gsindex = GS_TLS_SEL;
671 task->thread.gs = 0;
672 } else {
673 task->thread.gsindex = 0;
674 task->thread.gs = addr;
675 if (doit) {
676 load_gs_index(0);
677 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
678 }
679 }
680 put_cpu();
681 break;
682 case ARCH_SET_FS:
683 /* Not strictly needed for fs, but do it for symmetry
684 with gs */
685 if (addr >= TASK_SIZE)
686 return -EPERM;
687 cpu = get_cpu();
688 /* handle small bases via the GDT because that's faster to
689 switch. */
690 if (addr <= 0xffffffff) {
691 set_32bit_tls(task, FS_TLS, addr);
692 if (doit) {
693 load_TLS(&task->thread, cpu);
694 asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
695 }
696 task->thread.fsindex = FS_TLS_SEL;
697 task->thread.fs = 0;
698 } else {
699 task->thread.fsindex = 0;
700 task->thread.fs = addr;
701 if (doit) {
702 /* set the selector to 0 to not confuse
703 __switch_to */
704 asm volatile("movl %0,%%fs" :: "r" (0));
705 ret = checking_wrmsrl(MSR_FS_BASE, addr);
706 }
707 }
708 put_cpu();
709 break;
710 case ARCH_GET_FS: {
711 unsigned long base;
712 if (task->thread.fsindex == FS_TLS_SEL)
713 base = read_32bit_tls(task, FS_TLS);
714 else if (doit) {
715 rdmsrl(MSR_FS_BASE, base);
716 } else
717 base = task->thread.fs;
718 ret = put_user(base, (unsigned long __user *)addr);
719 break;
720 }
721 case ARCH_GET_GS: {
722 unsigned long base;
723 if (task->thread.gsindex == GS_TLS_SEL)
724 base = read_32bit_tls(task, GS_TLS);
725 else if (doit) {
726 rdmsrl(MSR_KERNEL_GS_BASE, base);
727 } else
728 base = task->thread.gs;
729 ret = put_user(base, (unsigned long __user *)addr);
730 break;
731 }
732
733 default:
734 ret = -EINVAL;
735 break;
736 }
737
738 return ret;
739}
740
741long sys_arch_prctl(int code, unsigned long addr)
742{
743 return do_arch_prctl(current, code, addr);
744}
745
746/*
747 * Capture the user space registers if the task is not running (in user space)
748 */
749int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
750{
751 struct pt_regs *pp, ptregs;
752
753 pp = (struct pt_regs *)(tsk->thread.rsp0);
754 --pp;
755
756 ptregs = *pp;
757 ptregs.cs &= 0xffff;
758 ptregs.ss &= 0xffff;
759
760 elf_core_copy_regs(regs, &ptregs);
761
762 return 1;
763}
764
765unsigned long arch_align_stack(unsigned long sp)
766{
767 if (randomize_va_space)
768 sp -= get_random_int() % 8192;
769 return sp & ~0xf;
770}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
new file mode 100644
index 000000000000..0b7b101debdf
--- /dev/null
+++ b/arch/x86_64/kernel/ptrace.c
@@ -0,0 +1,547 @@
1/* ptrace.c */
2/* By Ross Biro 1/23/92 */
3/*
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * x86-64 port 2000-2002 Andi Kleen
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/smp_lock.h>
15#include <linux/errno.h>
16#include <linux/ptrace.h>
17#include <linux/user.h>
18#include <linux/security.h>
19#include <linux/audit.h>
20#include <linux/seccomp.h>
21
22#include <asm/uaccess.h>
23#include <asm/pgtable.h>
24#include <asm/system.h>
25#include <asm/processor.h>
26#include <asm/i387.h>
27#include <asm/debugreg.h>
28#include <asm/ldt.h>
29#include <asm/desc.h>
30#include <asm/proto.h>
31#include <asm/ia32.h>
32
33/*
34 * This does not yet catch signals sent when the child dies;
35 * that would have to be done in exit.c or in signal.c.
36 */
37
38/* determines which flags the user has access to. */
39/* 1 = access 0 = no access */
40#define FLAG_MASK 0x44dd5UL
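/* Decoded for reference (assuming the standard EFLAGS bit layout):
   0x44dd5 covers CF, PF, AF, ZF, SF, TF, DF, OF, NT and AC.  Privileged
   bits such as IF, IOPL, RF and VM are excluded, so putreg() below
   preserves them no matter what the tracer writes. */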
41
42/* sets the trap flag. */
43#define TRAP_FLAG 0x100UL
44
45/*
46 * eflags and offset of eflags on child stack..
47 */
48#define EFLAGS offsetof(struct pt_regs, eflags)
49#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
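/* Worked example (a sketch, assuming the 2.6-era x86-64 pt_regs layout
   of 21 eight-byte fields with eflags as the 19th): EFLAGS is
   offsetof(struct pt_regs, eflags) = 144 and sizeof(struct pt_regs) =
   168, so EFL_OFFSET = 144 - 168 = -24.  get_stack_long(child,
   EFL_OFFSET) therefore reads 24 bytes below child->thread.rsp0, which
   is where the saved user eflags sits at the top of the kernel stack. */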
50
51/*
52 * this routine will get a word off of the process's privileged stack.
53 * the offset is how far from the base address as stored in the TSS.
54 * this routine assumes that all the privileged stacks are in our
55 * data space.
56 */
57static inline unsigned long get_stack_long(struct task_struct *task, int offset)
58{
59 unsigned char *stack;
60
61 stack = (unsigned char *)task->thread.rsp0;
62 stack += offset;
63 return (*((unsigned long *)stack));
64}
65
66/*
67 * this routine will put a word on the process's privileged stack.
68 * the offset is how far from the base address as stored in the TSS.
69 * this routine assumes that all the privileged stacks are in our
70 * data space.
71 */
72static inline long put_stack_long(struct task_struct *task, int offset,
73 unsigned long data)
74{
75 unsigned char * stack;
76
77 stack = (unsigned char *) task->thread.rsp0;
78 stack += offset;
79 *(unsigned long *) stack = data;
80 return 0;
81}
82
83/*
84 * Called by kernel/ptrace.c when detaching..
85 *
86 * Make sure the single step bit is not set.
87 */
88void ptrace_disable(struct task_struct *child)
89{
90 long tmp;
91
92 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
93 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
94 put_stack_long(child, EFL_OFFSET, tmp);
95}
96
97static int putreg(struct task_struct *child,
98 unsigned long regno, unsigned long value)
99{
100 unsigned long tmp;
101
102 /* Some code in the 64bit emulation may not be 64bit clean.
103 Don't take any chances. */
104 if (test_tsk_thread_flag(child, TIF_IA32))
105 value &= 0xffffffff;
106 switch (regno) {
107 case offsetof(struct user_regs_struct,fs):
108 if (value && (value & 3) != 3)
109 return -EIO;
110 child->thread.fsindex = value & 0xffff;
111 return 0;
112 case offsetof(struct user_regs_struct,gs):
113 if (value && (value & 3) != 3)
114 return -EIO;
115 child->thread.gsindex = value & 0xffff;
116 return 0;
117 case offsetof(struct user_regs_struct,ds):
118 if (value && (value & 3) != 3)
119 return -EIO;
120 child->thread.ds = value & 0xffff;
121 return 0;
122 case offsetof(struct user_regs_struct,es):
123 if (value && (value & 3) != 3)
124 return -EIO;
125 child->thread.es = value & 0xffff;
126 return 0;
127 case offsetof(struct user_regs_struct,ss):
128 if ((value & 3) != 3)
129 return -EIO;
130 value &= 0xffff;
131 return 0;
132 case offsetof(struct user_regs_struct,fs_base):
133 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
134 return -EIO;
135 child->thread.fs = value;
136 return 0;
137 case offsetof(struct user_regs_struct,gs_base):
138 if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
139 return -EIO;
140 child->thread.gs = value;
141 return 0;
142 case offsetof(struct user_regs_struct, eflags):
143 value &= FLAG_MASK;
144 tmp = get_stack_long(child, EFL_OFFSET);
145 tmp &= ~FLAG_MASK;
146 value |= tmp;
147 break;
148 case offsetof(struct user_regs_struct,cs):
149 if ((value & 3) != 3)
150 return -EIO;
151 value &= 0xffff;
152 break;
153 }
154 put_stack_long(child, regno - sizeof(struct pt_regs), value);
155 return 0;
156}
157
158static unsigned long getreg(struct task_struct *child, unsigned long regno)
159{
160 unsigned long val;
161 switch (regno) {
162 case offsetof(struct user_regs_struct, fs):
163 return child->thread.fsindex;
164 case offsetof(struct user_regs_struct, gs):
165 return child->thread.gsindex;
166 case offsetof(struct user_regs_struct, ds):
167 return child->thread.ds;
168 case offsetof(struct user_regs_struct, es):
169 return child->thread.es;
170 case offsetof(struct user_regs_struct, fs_base):
171 return child->thread.fs;
172 case offsetof(struct user_regs_struct, gs_base):
173 return child->thread.gs;
174 default:
175 regno = regno - sizeof(struct pt_regs);
176 val = get_stack_long(child, regno);
177 if (test_tsk_thread_flag(child, TIF_IA32))
178 val &= 0xffffffff;
179 return val;
180 }
181
182}
183
184asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
185{
186 struct task_struct *child;
187 long i, ret;
188 unsigned ui;
189
190 /* This lock_kernel fixes a subtle race with suid exec */
191 lock_kernel();
192 ret = -EPERM;
193 if (request == PTRACE_TRACEME) {
194 /* are we already being traced? */
195 if (current->ptrace & PT_PTRACED)
196 goto out;
197 ret = security_ptrace(current->parent, current);
198 if (ret)
199 goto out;
200 /* set the ptrace bit in the process flags. */
201 current->ptrace |= PT_PTRACED;
202 ret = 0;
203 goto out;
204 }
205 ret = -ESRCH;
206 read_lock(&tasklist_lock);
207 child = find_task_by_pid(pid);
208 if (child)
209 get_task_struct(child);
210 read_unlock(&tasklist_lock);
211 if (!child)
212 goto out;
213
214 ret = -EPERM;
215 if (pid == 1) /* you may not mess with init */
216 goto out_tsk;
217
218 if (request == PTRACE_ATTACH) {
219 ret = ptrace_attach(child);
220 goto out_tsk;
221 }
222 ret = ptrace_check_attach(child, request == PTRACE_KILL);
223 if (ret < 0)
224 goto out_tsk;
225
226 switch (request) {
227 /* when I and D space are separate, these will need to be fixed. */
228 case PTRACE_PEEKTEXT: /* read word at location addr. */
229 case PTRACE_PEEKDATA: {
230 unsigned long tmp;
231 int copied;
232
233 copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
234 ret = -EIO;
235 if (copied != sizeof(tmp))
236 break;
237 ret = put_user(tmp,(unsigned long __user *) data);
238 break;
239 }
240
241 /* read the word at location addr in the USER area. */
242 case PTRACE_PEEKUSR: {
243 unsigned long tmp;
244
245 ret = -EIO;
246 if ((addr & 7) ||
247 addr > sizeof(struct user) - 7)
248 break;
249
250 switch (addr) {
251 case 0 ... sizeof(struct user_regs_struct):
252 tmp = getreg(child, addr);
253 break;
254 case offsetof(struct user, u_debugreg[0]):
255 tmp = child->thread.debugreg0;
256 break;
257 case offsetof(struct user, u_debugreg[1]):
258 tmp = child->thread.debugreg1;
259 break;
260 case offsetof(struct user, u_debugreg[2]):
261 tmp = child->thread.debugreg2;
262 break;
263 case offsetof(struct user, u_debugreg[3]):
264 tmp = child->thread.debugreg3;
265 break;
266 case offsetof(struct user, u_debugreg[6]):
267 tmp = child->thread.debugreg6;
268 break;
269 case offsetof(struct user, u_debugreg[7]):
270 tmp = child->thread.debugreg7;
271 break;
272 default:
273 tmp = 0;
274 break;
275 }
276 ret = put_user(tmp,(unsigned long __user *) data);
277 break;
278 }
279
280 /* when I and D space are separate, this will have to be fixed. */
281 case PTRACE_POKETEXT: /* write the word at location addr. */
282 case PTRACE_POKEDATA:
283 ret = 0;
284 if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
285 break;
286 ret = -EIO;
287 break;
288
289 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
290 ret = -EIO;
291 if ((addr & 7) ||
292 addr > sizeof(struct user) - 7)
293 break;
294
295 switch (addr) {
296 case 0 ... sizeof(struct user_regs_struct):
297 ret = putreg(child, addr, data);
298 break;
299 /* Disallow setting a breakpoint in the vsyscall area */
300 case offsetof(struct user, u_debugreg[0]):
301 if (data >= TASK_SIZE-7) break;
302 child->thread.debugreg0 = data;
303 ret = 0;
304 break;
305 case offsetof(struct user, u_debugreg[1]):
306 if (data >= TASK_SIZE-7) break;
307 child->thread.debugreg1 = data;
308 ret = 0;
309 break;
310 case offsetof(struct user, u_debugreg[2]):
311 if (data >= TASK_SIZE-7) break;
312 child->thread.debugreg2 = data;
313 ret = 0;
314 break;
315 case offsetof(struct user, u_debugreg[3]):
316 if (data >= TASK_SIZE-7) break;
317 child->thread.debugreg3 = data;
318 ret = 0;
319 break;
320 case offsetof(struct user, u_debugreg[6]):
321 if (data >> 32)
322 break;
323 child->thread.debugreg6 = data;
324 ret = 0;
325 break;
326 case offsetof(struct user, u_debugreg[7]):
327 /* See arch/i386/kernel/ptrace.c for an explanation of
328 * this awkward check.*/
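 /* Rough gloss (inferred from the i386 explanation): each 4-bit
    len/type field in the high word of DR7 is tested against the
    0x5454 bitmap; nibble values whose bit is set - notably all
    I/O-breakpoint encodings - are rejected, and only when all four
    fields pass (i == 4) is the new value accepted. */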
329 data &= ~DR_CONTROL_RESERVED;
330 for(i=0; i<4; i++)
331 if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
332 break;
333 if (i == 4) {
334 child->thread.debugreg7 = data;
335 ret = 0;
336 }
337 break;
338 }
339 break;
340 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
341 case PTRACE_CONT: { /* restart after signal. */
342 long tmp;
343
344 ret = -EIO;
345 if ((unsigned long) data > _NSIG)
346 break;
347 if (request == PTRACE_SYSCALL)
348 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
349 else
350 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
351 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
352 child->exit_code = data;
353 /* make sure the single step bit is not set. */
354 tmp = get_stack_long(child, EFL_OFFSET);
355 tmp &= ~TRAP_FLAG;
356 put_stack_long(child, EFL_OFFSET,tmp);
357 wake_up_process(child);
358 ret = 0;
359 break;
360 }
361
362#ifdef CONFIG_IA32_EMULATION
363 /* This only makes sense with 32bit programs, but allow a
364 64bit debugger to fully examine them too. Better
365 not to use it against 64bit processes; use
366 PTRACE_ARCH_PRCTL instead. */
367 case PTRACE_SET_THREAD_AREA: {
368 struct user_desc __user *p;
369 int old;
370 p = (struct user_desc __user *)data;
371 get_user(old, &p->entry_number);
372 put_user(addr, &p->entry_number);
373 ret = do_set_thread_area(&child->thread, p);
374 put_user(old, &p->entry_number);
375 break;
376 case PTRACE_GET_THREAD_AREA:
377 p = (struct user_desc __user *)data;
378 get_user(old, &p->entry_number);
379 put_user(addr, &p->entry_number);
380 ret = do_get_thread_area(&child->thread, p);
381 put_user(old, &p->entry_number);
382 break;
383 }
384#endif
385 /* normal 64bit interface to access TLS data.
386 Works just like arch_prctl, except that the arguments
387 are reversed. */
388 case PTRACE_ARCH_PRCTL:
389 ret = do_arch_prctl(child, data, addr);
390 break;
391
392/*
393 * make the child exit. Best I can do is send it a sigkill.
394 * perhaps it should be put in the status that it wants to
395 * exit.
396 */
397 case PTRACE_KILL: {
398 long tmp;
399
400 ret = 0;
401 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
402 break;
403 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
404 child->exit_code = SIGKILL;
405 /* make sure the single step bit is not set. */
406 tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG;
407 put_stack_long(child, EFL_OFFSET, tmp);
408 wake_up_process(child);
409 break;
410 }
411
412 case PTRACE_SINGLESTEP: { /* set the trap flag. */
413 long tmp;
414
415 ret = -EIO;
416 if ((unsigned long) data > _NSIG)
417 break;
418 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
419 if ((child->ptrace & PT_DTRACE) == 0) {
420 /* Spurious delayed TF traps may occur */
421 child->ptrace |= PT_DTRACE;
422 }
423 tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG;
424 put_stack_long(child, EFL_OFFSET, tmp);
425 set_tsk_thread_flag(child, TIF_SINGLESTEP);
426 child->exit_code = data;
427 /* give it a chance to run. */
428 wake_up_process(child);
429 ret = 0;
430 break;
431 }
432
433 case PTRACE_DETACH:
434 /* detach a process that was attached. */
435 ret = ptrace_detach(child, data);
436 break;
437
438 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
439 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
440 sizeof(struct user_regs_struct))) {
441 ret = -EIO;
442 break;
443 }
444 ret = 0;
445 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
446 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
447 data += sizeof(long);
448 }
449 break;
450 }
451
452 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
453 unsigned long tmp;
454 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
455 sizeof(struct user_regs_struct))) {
456 ret = -EIO;
457 break;
458 }
459 ret = 0;
460 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
461 ret |= __get_user(tmp, (unsigned long __user *) data);
462 putreg(child, ui, tmp);
463 data += sizeof(long);
464 }
465 break;
466 }
467
468 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
469 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
470 sizeof(struct user_i387_struct))) {
471 ret = -EIO;
472 break;
473 }
474 ret = get_fpregs((struct user_i387_struct __user *)data, child);
475 break;
476 }
477
478 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
479 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
480 sizeof(struct user_i387_struct))) {
481 ret = -EIO;
482 break;
483 }
484 set_stopped_child_used_math(child);
485 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
486 break;
487 }
488
489 default:
490 ret = ptrace_request(child, request, addr, data);
491 break;
492 }
493out_tsk:
494 put_task_struct(child);
495out:
496 unlock_kernel();
497 return ret;
498}
499
500static void syscall_trace(struct pt_regs *regs)
501{
502
503#if 0
504 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
505 current->comm,
506 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
507 current_thread_info()->flags, current->ptrace);
508#endif
509
510 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
511 ? 0x80 : 0));
512 /*
513 * this isn't the same as continuing with a signal, but it will do
514 * for normal use. strace only continues with a signal if the
515 * stopping signal is not SIGTRAP. -brl
516 */
517 if (current->exit_code) {
518 send_sig(current->exit_code, current, 1);
519 current->exit_code = 0;
520 }
521}
522
523asmlinkage void syscall_trace_enter(struct pt_regs *regs)
524{
525 /* do the secure computing check first */
526 secure_computing(regs->orig_rax);
527
528 if (unlikely(current->audit_context))
529 audit_syscall_entry(current, regs->orig_rax,
530 regs->rdi, regs->rsi,
531 regs->rdx, regs->r10);
532
533 if (test_thread_flag(TIF_SYSCALL_TRACE)
534 && (current->ptrace & PT_PTRACED))
535 syscall_trace(regs);
536}
537
538asmlinkage void syscall_trace_leave(struct pt_regs *regs)
539{
540 if (unlikely(current->audit_context))
541 audit_syscall_exit(current, regs->rax);
542
543 if ((test_thread_flag(TIF_SYSCALL_TRACE)
544 || test_thread_flag(TIF_SINGLESTEP))
545 && (current->ptrace & PT_PTRACED))
546 syscall_trace(regs);
547}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
new file mode 100644
index 000000000000..be4b36f762cf
--- /dev/null
+++ b/arch/x86_64/kernel/reboot.c
@@ -0,0 +1,163 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <asm/io.h>
10#include <asm/kdebug.h>
11#include <asm/delay.h>
12#include <asm/hw_irq.h>
13#include <asm/system.h>
14#include <asm/pgtable.h>
15#include <asm/tlbflush.h>
16#include <asm/apic.h>
17
18/*
19 * Power off function, if any
20 */
21void (*pm_power_off)(void);
22
23static long no_idt[3];
24static enum {
25 BOOT_TRIPLE = 't',
26 BOOT_KBD = 'k'
27} reboot_type = BOOT_KBD;
28static int reboot_mode = 0;
29int reboot_force;
30
31/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
32 warm Don't set the cold reboot flag
33 cold Set the cold reboot flag
34 triple Force a triple fault (init)
35 kbd Use the keyboard controller. cold reset (default)
36 force Avoid anything that could hang.
37 */
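/* Illustrative usage sketch (hypothetical boot line, not in the original
   source): booting with "reboot=warm,triple" makes reboot_setup() below
   set reboot_mode to 0x1234 (the traditional BIOS warm-boot flag later
   written to 0x472) and reboot_type to BOOT_TRIPLE, so machine_restart()
   attempts a triple fault first and only falls back to the keyboard
   controller if that returns. */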
38static int __init reboot_setup(char *str)
39{
40 for (;;) {
41 switch (*str) {
42 case 'w':
43 reboot_mode = 0x1234;
44 break;
45
46 case 'c':
47 reboot_mode = 0;
48 break;
49
50 case 't':
51 case 'b':
52 case 'k':
53 reboot_type = *str;
54 break;
55 case 'f':
56 reboot_force = 1;
57 break;
58 }
59 if((str = strchr(str,',')) != NULL)
60 str++;
61 else
62 break;
63 }
64 return 1;
65}
66
67__setup("reboot=", reboot_setup);
68
69#ifdef CONFIG_SMP
70static void smp_halt(void)
71{
72 int cpuid = safe_smp_processor_id();
73 static int first_entry = 1;
74
75 if (reboot_force)
76 return;
77
78 if (first_entry) {
79 first_entry = 0;
80 smp_call_function((void *)machine_restart, NULL, 1, 0);
81 }
82
83 smp_stop_cpu();
84
85 /* AP calling this. Just halt */
86 if (cpuid != boot_cpu_id) {
87 for (;;)
88 asm("hlt");
89 }
90
91 /* Wait for all other CPUs to have run smp_stop_cpu */
92 while (!cpus_empty(cpu_online_map))
93 rep_nop();
94}
95#endif
96
97static inline void kb_wait(void)
98{
99 int i;
100
101 for (i=0; i<0x10000; i++)
102 if ((inb_p(0x64) & 0x02) == 0)
103 break;
104}
105
106void machine_restart(char * __unused)
107{
108 int i;
109
110 printk("machine restart\n");
111
112#ifdef CONFIG_SMP
113 smp_halt();
114#endif
115
116 if (!reboot_force) {
117 local_irq_disable();
118#ifndef CONFIG_SMP
119 disable_local_APIC();
120#endif
121 disable_IO_APIC();
122 local_irq_enable();
123 }
124
125 /* Tell the BIOS if we want cold or warm reboot */
126 *((unsigned short *)__va(0x472)) = reboot_mode;
127
128 for (;;) {
129 /* Could also try the reset bit in the Hammer NB */
130 switch (reboot_type) {
131 case BOOT_KBD:
132 for (i=0; i<100; i++) {
133 kb_wait();
134 udelay(50);
135 outb(0xfe,0x64); /* pulse reset low */
136 udelay(50);
137 }
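 /* No break here, apparently by design: if the keyboard-controller
    reset did not take effect, fall through and attempt a triple
    fault instead. */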
138
139 case BOOT_TRIPLE:
140 __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
141 __asm__ __volatile__("int3");
142
143 reboot_type = BOOT_KBD;
144 break;
145 }
146 }
147}
148
149EXPORT_SYMBOL(machine_restart);
150
151void machine_halt(void)
152{
153}
154
155EXPORT_SYMBOL(machine_halt);
156
157void machine_power_off(void)
158{
159 if (pm_power_off)
160 pm_power_off();
161}
162
163EXPORT_SYMBOL(machine_power_off);
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
new file mode 100644
index 000000000000..48f7c18172b9
--- /dev/null
+++ b/arch/x86_64/kernel/semaphore.c
@@ -0,0 +1,180 @@
1/*
2 * x86_64 semaphore implementation.
3 *
4 * (C) Copyright 1999 Linus Torvalds
5 *
6 * Portions Copyright 1999 Red Hat, Inc.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
14 */
15#include <linux/config.h>
16#include <linux/sched.h>
17#include <linux/init.h>
18#include <asm/errno.h>
19
20#include <asm/semaphore.h>
21
22/*
23 * Semaphores are implemented using a two-way counter:
24 * The "count" variable is decremented for each process
25 * that tries to acquire the semaphore, while the "sleepers"
26 * variable is a count of such acquires.
27 *
28 * Notably, the inline "up()" and "down()" functions can
29 * efficiently test if they need to do any extra work (up
30 * needs to do something only if count was negative before
31 * the increment operation).
32 *
33 * "sleepers" and the contention routine ordering is protected
34 * by the spinlock in the semaphore's waitqueue head.
35 *
36 * Note that these functions are only called when there is
37 * contention on the lock, and as such all this is the
38 * "non-critical" part of the whole semaphore business. The
39 * critical part is the inline stuff in <asm/semaphore.h>
40 * where we want to avoid any extra jumps and calls.
41 */
42
43/*
44 * Logic:
45 * - only on a boundary condition do we need to care. When we go
46 * from a negative count to a non-negative, we wake people up.
47 * - when we go from a non-negative count to a negative do we
48 * (a) synchronize with the "sleeper" count and (b) make sure
49 * that we're on the wakeup list before we synchronize so that
50 * we cannot lose wakeup events.
51 */
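/* Worked example (illustrative trace, not in the original source):
   take a semaphore initialised with count = 1.  A's down() moves the
   count 1 -> 0 and takes the fast path.  B's down() moves it 0 -> -1,
   so B enters __down(), bumps sleepers to 1, and its
   atomic_add_negative(sleepers - 1, &count) adds 0, leaving the count
   negative, so B sleeps.  A's up() moves the count -1 -> 0 and takes
   the slow path into __up(), waking B.  On its next pass B again adds
   sleepers - 1 = 0, now sees a non-negative count, resets sleepers to
   0 and leaves __down() holding the semaphore with count = 0. */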
52
53void __up(struct semaphore *sem)
54{
55 wake_up(&sem->wait);
56}
57
58void __sched __down(struct semaphore * sem)
59{
60 struct task_struct *tsk = current;
61 DECLARE_WAITQUEUE(wait, tsk);
62 unsigned long flags;
63
64 tsk->state = TASK_UNINTERRUPTIBLE;
65 spin_lock_irqsave(&sem->wait.lock, flags);
66 add_wait_queue_exclusive_locked(&sem->wait, &wait);
67
68 sem->sleepers++;
69 for (;;) {
70 int sleepers = sem->sleepers;
71
72 /*
73 * Add "everybody else" into it. They aren't
74 * playing, because we own the spinlock in
75 * the wait_queue_head.
76 */
77 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
78 sem->sleepers = 0;
79 break;
80 }
81 sem->sleepers = 1; /* us - see -1 above */
82 spin_unlock_irqrestore(&sem->wait.lock, flags);
83
84 schedule();
85
86 spin_lock_irqsave(&sem->wait.lock, flags);
87 tsk->state = TASK_UNINTERRUPTIBLE;
88 }
89 remove_wait_queue_locked(&sem->wait, &wait);
90 wake_up_locked(&sem->wait);
91 spin_unlock_irqrestore(&sem->wait.lock, flags);
92 tsk->state = TASK_RUNNING;
93}
94
95int __sched __down_interruptible(struct semaphore * sem)
96{
97 int retval = 0;
98 struct task_struct *tsk = current;
99 DECLARE_WAITQUEUE(wait, tsk);
100 unsigned long flags;
101
102 tsk->state = TASK_INTERRUPTIBLE;
103 spin_lock_irqsave(&sem->wait.lock, flags);
104 add_wait_queue_exclusive_locked(&sem->wait, &wait);
105
106 sem->sleepers++;
107 for (;;) {
108 int sleepers = sem->sleepers;
109
110 /*
111 * With signals pending, this turns into
112 * the trylock failure case - we won't be
113 * sleeping, and we can't get the lock as
114 * it has contention. Just correct the count
115 * and exit.
116 */
117 if (signal_pending(current)) {
118 retval = -EINTR;
119 sem->sleepers = 0;
120 atomic_add(sleepers, &sem->count);
121 break;
122 }
123
124 /*
125 * Add "everybody else" into it. They aren't
126 * playing, because we own the spinlock in
127 * wait_queue_head. The "-1" is because we're
128 * still hoping to get the semaphore.
129 */
130 if (!atomic_add_negative(sleepers - 1, &sem->count)) {
131 sem->sleepers = 0;
132 break;
133 }
134 sem->sleepers = 1; /* us - see -1 above */
135 spin_unlock_irqrestore(&sem->wait.lock, flags);
136
137 schedule();
138
139 spin_lock_irqsave(&sem->wait.lock, flags);
140 tsk->state = TASK_INTERRUPTIBLE;
141 }
142 remove_wait_queue_locked(&sem->wait, &wait);
143 wake_up_locked(&sem->wait);
144 spin_unlock_irqrestore(&sem->wait.lock, flags);
145
146 tsk->state = TASK_RUNNING;
147 return retval;
148}
149
150/*
151 * Trylock failed - make sure we correct for
152 * having decremented the count.
153 *
154 * We could have done the trylock with a
155 * single "cmpxchg" without failure cases,
156 * but then it wouldn't work on a 386.
157 */
158int __down_trylock(struct semaphore * sem)
159{
160 int sleepers;
161 unsigned long flags;
162
163 spin_lock_irqsave(&sem->wait.lock, flags);
164 sleepers = sem->sleepers + 1;
165 sem->sleepers = 0;
166
167 /*
168 * Add "everybody else" and us into it. They aren't
169 * playing, because we own the spinlock in the
170 * wait_queue_head.
171 */
172 if (!atomic_add_negative(sleepers, &sem->count)) {
173 wake_up_locked(&sem->wait);
174 }
175
176 spin_unlock_irqrestore(&sem->wait.lock, flags);
177 return 1;
178}
179
180
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
new file mode 100644
index 000000000000..e50fc41de6b9
--- /dev/null
+++ b/arch/x86_64/kernel/setup.c
@@ -0,0 +1,1189 @@
1/*
2 * linux/arch/x86-64/kernel/setup.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Nov 2001 Dave Jones <davej@suse.de>
7 * Forked from i386 setup code.
8 *
9 * $Id$
10 */
11
12/*
13 * This file handles the architecture-dependent parts of initialization
14 */
15
16#include <linux/errno.h>
17#include <linux/sched.h>
18#include <linux/kernel.h>
19#include <linux/mm.h>
20#include <linux/stddef.h>
21#include <linux/unistd.h>
22#include <linux/ptrace.h>
23#include <linux/slab.h>
24#include <linux/user.h>
25#include <linux/a.out.h>
26#include <linux/tty.h>
27#include <linux/ioport.h>
28#include <linux/delay.h>
29#include <linux/config.h>
30#include <linux/init.h>
31#include <linux/initrd.h>
32#include <linux/highmem.h>
33#include <linux/bootmem.h>
34#include <linux/module.h>
35#include <asm/processor.h>
36#include <linux/console.h>
37#include <linux/seq_file.h>
38#include <linux/root_dev.h>
39#include <linux/pci.h>
40#include <linux/acpi.h>
41#include <linux/kallsyms.h>
42#include <linux/edd.h>
43#include <asm/mtrr.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46#include <asm/io.h>
47#include <asm/smp.h>
48#include <asm/msr.h>
49#include <asm/desc.h>
50#include <video/edid.h>
51#include <asm/e820.h>
52#include <asm/dma.h>
53#include <asm/mpspec.h>
54#include <asm/mmu_context.h>
55#include <asm/bootsetup.h>
56#include <asm/proto.h>
57#include <asm/setup.h>
58#include <asm/mach_apic.h>
59#include <asm/numa.h>
60
61/*
62 * Machine setup..
63 */
64
65struct cpuinfo_x86 boot_cpu_data;
66
67unsigned long mmu_cr4_features;
68
69int acpi_disabled;
70EXPORT_SYMBOL(acpi_disabled);
71#ifdef CONFIG_ACPI_BOOT
72extern int __initdata acpi_ht;
73extern acpi_interrupt_flags acpi_sci_flags;
74int __initdata acpi_force = 0;
75#endif
76
77int acpi_numa __initdata;
78
79/* For PCI or other memory-mapped resources */
80unsigned long pci_mem_start = 0x10000000;
81
82/* Boot loader ID as an integer, for the benefit of proc_dointvec */
83int bootloader_type;
84
85unsigned long saved_video_mode;
86
87#ifdef CONFIG_SWIOTLB
88int swiotlb;
89EXPORT_SYMBOL(swiotlb);
90#endif
91
92/*
93 * Setup options
94 */
95struct drive_info_struct { char dummy[32]; } drive_info;
96struct screen_info screen_info;
97struct sys_desc_table_struct {
98 unsigned short length;
99 unsigned char table[0];
100};
101
102struct edid_info edid_info;
103struct e820map e820;
104
105extern int root_mountflags;
106extern char _text, _etext, _edata, _end;
107
108char command_line[COMMAND_LINE_SIZE];
109
110struct resource standard_io_resources[] = {
111 { .name = "dma1", .start = 0x00, .end = 0x1f,
112 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
113 { .name = "pic1", .start = 0x20, .end = 0x21,
114 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
115 { .name = "timer0", .start = 0x40, .end = 0x43,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
117 { .name = "timer1", .start = 0x50, .end = 0x53,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
119 { .name = "keyboard", .start = 0x60, .end = 0x6f,
120 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
121 { .name = "dma page reg", .start = 0x80, .end = 0x8f,
122 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
123 { .name = "pic2", .start = 0xa0, .end = 0xa1,
124 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
125 { .name = "dma2", .start = 0xc0, .end = 0xdf,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO },
127 { .name = "fpu", .start = 0xf0, .end = 0xff,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
129};
130
131#define STANDARD_IO_RESOURCES \
132 (sizeof standard_io_resources / sizeof standard_io_resources[0])
133
134#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
135
136struct resource data_resource = {
137 .name = "Kernel data",
138 .start = 0,
139 .end = 0,
140 .flags = IORESOURCE_RAM,
141};
142struct resource code_resource = {
143 .name = "Kernel code",
144 .start = 0,
145 .end = 0,
146 .flags = IORESOURCE_RAM,
147};
148
149#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
150
151static struct resource system_rom_resource = {
152 .name = "System ROM",
153 .start = 0xf0000,
154 .end = 0xfffff,
155 .flags = IORESOURCE_ROM,
156};
157
158static struct resource extension_rom_resource = {
159 .name = "Extension ROM",
160 .start = 0xe0000,
161 .end = 0xeffff,
162 .flags = IORESOURCE_ROM,
163};
164
165static struct resource adapter_rom_resources[] = {
166 { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
167 .flags = IORESOURCE_ROM },
168 { .name = "Adapter ROM", .start = 0, .end = 0,
169 .flags = IORESOURCE_ROM },
170 { .name = "Adapter ROM", .start = 0, .end = 0,
171 .flags = IORESOURCE_ROM },
172 { .name = "Adapter ROM", .start = 0, .end = 0,
173 .flags = IORESOURCE_ROM },
174 { .name = "Adapter ROM", .start = 0, .end = 0,
175 .flags = IORESOURCE_ROM },
176 { .name = "Adapter ROM", .start = 0, .end = 0,
177 .flags = IORESOURCE_ROM }
178};
179
180#define ADAPTER_ROM_RESOURCES \
181 (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
182
183static struct resource video_rom_resource = {
184 .name = "Video ROM",
185 .start = 0xc0000,
186 .end = 0xc7fff,
187 .flags = IORESOURCE_ROM,
188};
189
190static struct resource video_ram_resource = {
191 .name = "Video RAM area",
192 .start = 0xa0000,
193 .end = 0xbffff,
194 .flags = IORESOURCE_RAM,
195};
196
197#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
198
199static int __init romchecksum(unsigned char *rom, unsigned long length)
200{
201 unsigned char *p, sum = 0;
202
203 for (p = rom; p < rom + length; p++)
204 sum += *p;
205 return sum == 0;
206}
207
208static void __init probe_roms(void)
209{
210 unsigned long start, length, upper;
211 unsigned char *rom;
212 int i;
213
214 /* video rom */
215 upper = adapter_rom_resources[0].start;
216 for (start = video_rom_resource.start; start < upper; start += 2048) {
217 rom = isa_bus_to_virt(start);
218 if (!romsignature(rom))
219 continue;
220
221 video_rom_resource.start = start;
222
223 /* 0 < length <= 0x7f * 512, historically */
224 length = rom[2] * 512;
225
226 /* if checksum okay, trust length byte */
227 if (length && romchecksum(rom, length))
228 video_rom_resource.end = start + length - 1;
229
230 request_resource(&iomem_resource, &video_rom_resource);
231 break;
232 }
233
234 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
235 if (start < upper)
236 start = upper;
237
238 /* system rom */
239 request_resource(&iomem_resource, &system_rom_resource);
240 upper = system_rom_resource.start;
241
242 /* check for extension rom (ignore length byte!) */
243 rom = isa_bus_to_virt(extension_rom_resource.start);
244 if (romsignature(rom)) {
245 length = extension_rom_resource.end - extension_rom_resource.start + 1;
246 if (romchecksum(rom, length)) {
247 request_resource(&iomem_resource, &extension_rom_resource);
248 upper = extension_rom_resource.start;
249 }
250 }
251
252 /* check for adapter roms on 2k boundaries */
253 for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
254 rom = isa_bus_to_virt(start);
255 if (!romsignature(rom))
256 continue;
257
258 /* 0 < length <= 0x7f * 512, historically */
259 length = rom[2] * 512;
260
261 /* but accept any length that fits if checksum okay */
262 if (!length || start + length > upper || !romchecksum(rom, length))
263 continue;
264
265 adapter_rom_resources[i].start = start;
266 adapter_rom_resources[i].end = start + length - 1;
267 request_resource(&iomem_resource, &adapter_rom_resources[i]);
268
269 start = adapter_rom_resources[i++].end & ~2047UL;
270 }
271}
272
273static __init void parse_cmdline_early (char ** cmdline_p)
274{
275 char c = ' ', *to = command_line, *from = COMMAND_LINE;
276 int len = 0;
277
278 /* Save unparsed command line copy for /proc/cmdline */
279 memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE);
280 saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
281
282 for (;;) {
283 if (c != ' ')
284 goto next_char;
285
286#ifdef CONFIG_SMP
287 /*
288 * If the BIOS enumerates physical processors before logical,
289 * maxcpus=N at enumeration-time can be used to disable HT.
290 */
291 else if (!memcmp(from, "maxcpus=", 8)) {
292 extern unsigned int maxcpus;
293
294 maxcpus = simple_strtoul(from + 8, NULL, 0);
295 }
296#endif
297#ifdef CONFIG_ACPI_BOOT
298 /* "acpi=off" disables both ACPI table parsing and interpreter init */
299 if (!memcmp(from, "acpi=off", 8))
300 disable_acpi();
301
302 if (!memcmp(from, "acpi=force", 10)) {
303 /* add later when we do DMI horrors: */
304 acpi_force = 1;
305 acpi_disabled = 0;
306 }
307
308 /* acpi=ht just means: do ACPI MADT parsing
309 at bootup, but don't enable the full ACPI interpreter */
310 if (!memcmp(from, "acpi=ht", 7)) {
311 if (!acpi_force)
312 disable_acpi();
313 acpi_ht = 1;
314 }
315 else if (!memcmp(from, "pci=noacpi", 10))
316 acpi_disable_pci();
317 else if (!memcmp(from, "acpi=noirq", 10))
318 acpi_noirq_set();
319
320 else if (!memcmp(from, "acpi_sci=edge", 13))
321 acpi_sci_flags.trigger = 1;
322 else if (!memcmp(from, "acpi_sci=level", 14))
323 acpi_sci_flags.trigger = 3;
324 else if (!memcmp(from, "acpi_sci=high", 13))
325 acpi_sci_flags.polarity = 1;
326 else if (!memcmp(from, "acpi_sci=low", 12))
327 acpi_sci_flags.polarity = 3;
328
329 /* acpi=strict disables out-of-spec workarounds */
330 else if (!memcmp(from, "acpi=strict", 11)) {
331 acpi_strict = 1;
332 }
333#endif
334
335 if (!memcmp(from, "nolapic", 7) ||
336 !memcmp(from, "disableapic", 11))
337 disable_apic = 1;
338
339 if (!memcmp(from, "noapic", 6))
340 skip_ioapic_setup = 1;
341
342 if (!memcmp(from, "apic", 4)) {
343 skip_ioapic_setup = 0;
344 ioapic_force = 1;
345 }
346
347 if (!memcmp(from, "mem=", 4))
348 parse_memopt(from+4, &from);
349
350#ifdef CONFIG_DISCONTIGMEM
351 if (!memcmp(from, "numa=", 5))
352 numa_setup(from+5);
353#endif
354
355#ifdef CONFIG_GART_IOMMU
356 if (!memcmp(from,"iommu=",6)) {
357 iommu_setup(from+6);
358 }
359#endif
360
361 if (!memcmp(from,"oops=panic", 10))
362 panic_on_oops = 1;
363
364 if (!memcmp(from, "noexec=", 7))
365 nonx_setup(from + 7);
366
367 next_char:
368 c = *(from++);
369 if (!c)
370 break;
371 if (COMMAND_LINE_SIZE <= ++len)
372 break;
373 *(to++) = c;
374 }
375 *to = '\0';
376 *cmdline_p = command_line;
377}
378
379#ifndef CONFIG_DISCONTIGMEM
380static void __init contig_initmem_init(void)
381{
382 unsigned long bootmap_size, bootmap;
383 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
384 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
385 if (bootmap == -1L)
386 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
387 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
388 e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT);
389 reserve_bootmem(bootmap, bootmap_size);
390}
391#endif
392
393/* Use inline assembly to define this because the nops are defined
394 as inline assembly strings in the include files and we cannot
395 easily get at the individual byte sequences from C. */
396asm("\t.data\nk8nops: "
397 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
398 K8_NOP7 K8_NOP8);
399
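/* k8_nops[k] points at the start of the k-byte NOP inside the byte sequence emitted above. */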
400extern unsigned char k8nops[];
401static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
402 NULL,
403 k8nops,
404 k8nops + 1,
405 k8nops + 1 + 2,
406 k8nops + 1 + 2 + 3,
407 k8nops + 1 + 2 + 3 + 4,
408 k8nops + 1 + 2 + 3 + 4 + 5,
409 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
410 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
411};
412
413/* Replace instructions with better alternatives for this CPU type.
414
415 This runs before SMP is initialized to avoid SMP problems with
416 self-modifying code. This implies that asymmetric systems where
417 APs have fewer capabilities than the boot processor are not handled.
418 In this case boot with "noreplacement". */
419void apply_alternatives(void *start, void *end)
420{
421 struct alt_instr *a;
422 int diff, i, k;
423 for (a = start; (void *)a < end; a++) {
424 if (!boot_cpu_has(a->cpuid))
425 continue;
426
427 BUG_ON(a->replacementlen > a->instrlen);
428 __inline_memcpy(a->instr, a->replacement, a->replacementlen);
429 diff = a->instrlen - a->replacementlen;
430
431 /* Pad the rest with nops */
432 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
433 k = diff;
434 if (k > ASM_NOP_MAX)
435 k = ASM_NOP_MAX;
436 __inline_memcpy(a->instr + i, k8_nops[k], k);
437 }
438 }
439}
440
441static int no_replacement __initdata = 0;
442
443void __init alternative_instructions(void)
444{
445 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
446 if (no_replacement)
447 return;
448 apply_alternatives(__alt_instructions, __alt_instructions_end);
449}
450
451static int __init noreplacement_setup(char *s)
452{
453 no_replacement = 1;
454 return 0;
455}
456
457__setup("noreplacement", noreplacement_setup);
458
459#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
460struct edd edd;
461#ifdef CONFIG_EDD_MODULE
462EXPORT_SYMBOL(edd);
463#endif
464/**
465 * copy_edd() - Copy the BIOS EDD information
466 * from boot_params into a safe place.
467 *
468 */
469static inline void copy_edd(void)
470{
471 memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
472 memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
473 edd.mbr_signature_nr = EDD_MBR_SIG_NR;
474 edd.edd_info_nr = EDD_NR;
475}
476#else
477static inline void copy_edd(void)
478{
479}
480#endif
481
482#define EBDA_ADDR_POINTER 0x40E
483static void __init reserve_ebda_region(void)
484{
485 unsigned int addr;
486 /**
487 * there is a real-mode segmented pointer pointing to the
488 * 4K EBDA area at 0x40E
489 */
490 addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
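	/* the BIOS stores a real-mode segment here; shift left by 4 to get the physical address */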
491 addr <<= 4;
492 if (addr)
493 reserve_bootmem_generic(addr, PAGE_SIZE);
494}
495
496void __init setup_arch(char **cmdline_p)
497{
498 unsigned long low_mem_size;
499 unsigned long kernel_end;
500
501 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
502 drive_info = DRIVE_INFO;
503 screen_info = SCREEN_INFO;
504 edid_info = EDID_INFO;
505 saved_video_mode = SAVED_VIDEO_MODE;
506 bootloader_type = LOADER_TYPE;
507
508#ifdef CONFIG_BLK_DEV_RAM
509 rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
510 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
511 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
512#endif
513 setup_memory_region();
514 copy_edd();
515
516 if (!MOUNT_ROOT_RDONLY)
517 root_mountflags &= ~MS_RDONLY;
518 init_mm.start_code = (unsigned long) &_text;
519 init_mm.end_code = (unsigned long) &_etext;
520 init_mm.end_data = (unsigned long) &_edata;
521 init_mm.brk = (unsigned long) &_end;
522
523 code_resource.start = virt_to_phys(&_text);
524 code_resource.end = virt_to_phys(&_etext)-1;
525 data_resource.start = virt_to_phys(&_etext);
526 data_resource.end = virt_to_phys(&_edata)-1;
527
528 parse_cmdline_early(cmdline_p);
529
530 early_identify_cpu(&boot_cpu_data);
531
532 /*
533 * partially used pages are not usable - thus
534 * we are rounding upwards:
535 */
536 end_pfn = e820_end_of_ram();
537
538 check_efer();
539
540 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
541
542#ifdef CONFIG_ACPI_BOOT
543 /*
544 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
545 * Call this early for SRAT node setup.
546 */
547 acpi_boot_table_init();
548#endif
549
550#ifdef CONFIG_ACPI_NUMA
551 /*
552 * Parse SRAT to discover nodes.
553 */
554 acpi_numa_init();
555#endif
556
557#ifdef CONFIG_DISCONTIGMEM
558 numa_initmem_init(0, end_pfn);
559#else
560 contig_initmem_init();
561#endif
562
563 /* Reserve direct mapping */
564 reserve_bootmem_generic(table_start << PAGE_SHIFT,
565 (table_end - table_start) << PAGE_SHIFT);
566
567 /* reserve kernel */
568 kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
569 reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
570
571 /*
572 * reserve physical page 0 - it's a special BIOS page on many boxes,
573 * enabling clean reboots, SMP operation, laptop functions.
574 */
575 reserve_bootmem_generic(0, PAGE_SIZE);
576
577 /* reserve ebda region */
578 reserve_ebda_region();
579
580#ifdef CONFIG_SMP
581 /*
582 * But first pinch a few for the stack/trampoline stuff
583 * FIXME: Don't need the extra page at 4K, but need to fix
584 * trampoline before removing it. (see the GDT stuff)
585 */
586 reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
587
588 /* Reserve SMP trampoline */
589 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
590#endif
591
592#ifdef CONFIG_ACPI_SLEEP
593 /*
594 * Reserve low memory region for sleep support.
595 */
596 acpi_reserve_bootmem();
597#endif
598#ifdef CONFIG_X86_LOCAL_APIC
599 /*
600 * Find and reserve possible boot-time SMP configuration:
601 */
602 find_smp_config();
603#endif
604#ifdef CONFIG_BLK_DEV_INITRD
605 if (LOADER_TYPE && INITRD_START) {
606 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
607 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
608 initrd_start =
609 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
610 initrd_end = initrd_start+INITRD_SIZE;
611 }
612 else {
613 printk(KERN_ERR "initrd extends beyond end of memory "
614 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
615 (unsigned long)(INITRD_START + INITRD_SIZE),
616 (unsigned long)(end_pfn << PAGE_SHIFT));
617 initrd_start = 0;
618 }
619 }
620#endif
621 paging_init();
622
623 check_ioapic();
624
625#ifdef CONFIG_ACPI_BOOT
626 /*
627 * Read APIC and some other early information from ACPI tables.
628 */
629 acpi_boot_init();
630#endif
631
632#ifdef CONFIG_X86_LOCAL_APIC
633 /*
634 * get boot-time SMP configuration:
635 */
636 if (smp_found_config)
637 get_smp_config();
638 init_apic_mappings();
639#endif
640
641 /*
642 * Request address space for all standard RAM and ROM resources
643 * and also for regions reported as reserved by the e820.
644 */
645 probe_roms();
646 e820_reserve_resources();
647
648 request_resource(&iomem_resource, &video_ram_resource);
649
650 {
651 unsigned i;
652 /* request I/O space for devices used on all i[345]86 PCs */
653 for (i = 0; i < STANDARD_IO_RESOURCES; i++)
654 request_resource(&ioport_resource, &standard_io_resources[i]);
655 }
656
657 /* Will likely break when you have unassigned resources with more
658 than 4GB memory and bridges that don't support more than 4GB.
659 Doing it properly would require using pci_alloc_consistent
660 in this case. */
661 low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff;
662 if (low_mem_size > pci_mem_start)
663 pci_mem_start = low_mem_size;
664
665#ifdef CONFIG_GART_IOMMU
666 iommu_hole_init();
667#endif
668
669#ifdef CONFIG_VT
670#if defined(CONFIG_VGA_CONSOLE)
671 conswitchp = &vga_con;
672#elif defined(CONFIG_DUMMY_CONSOLE)
673 conswitchp = &dummy_con;
674#endif
675#endif
676}
677
678static int __init get_model_name(struct cpuinfo_x86 *c)
679{
680 unsigned int *v;
681
682 if (c->x86_cpuid_level < 0x80000004)
683 return 0;
684
685 v = (unsigned int *) c->x86_model_id;
686 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
687 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
688 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
689 c->x86_model_id[48] = 0;
690 return 1;
691}
692
693
694static void __init display_cacheinfo(struct cpuinfo_x86 *c)
695{
696 unsigned int n, dummy, eax, ebx, ecx, edx;
697
698 n = c->x86_cpuid_level;
699
700 if (n >= 0x80000005) {
701 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
702 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
703 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
704 c->x86_cache_size=(ecx>>24)+(edx>>24);
705 /* On K8 L1 TLB is inclusive, so don't count it */
706 c->x86_tlbsize = 0;
707 }
708
709 if (n >= 0x80000006) {
710 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
711 ecx = cpuid_ecx(0x80000006);
712 c->x86_cache_size = ecx >> 16;
713 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
714
715 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
716 c->x86_cache_size, ecx & 0xFF);
717 }
718
719 if (n >= 0x80000007)
720 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
721 if (n >= 0x80000008) {
722 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
723 c->x86_virt_bits = (eax >> 8) & 0xff;
724 c->x86_phys_bits = eax & 0xff;
725 }
726}
727
728
729static int __init init_amd(struct cpuinfo_x86 *c)
730{
731 int r;
732 int level;
733#ifdef CONFIG_NUMA
734 int cpu;
735#endif
736
737 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
738 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
739 clear_bit(0*32+31, &c->x86_capability);
740
741 /* C-stepping K8? */
742 level = cpuid_eax(1);
743 if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
744 set_bit(X86_FEATURE_K8_C, &c->x86_capability);
745
746 r = get_model_name(c);
747 if (!r) {
748 switch (c->x86) {
749 case 15:
750 /* Should distinguish Models here, but this is only
751 a fallback anyway. */
752 strcpy(c->x86_model_id, "Hammer");
753 break;
754 }
755 }
756 display_cacheinfo(c);
757
758 if (c->x86_cpuid_level >= 0x80000008) {
759 c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
760 if (c->x86_num_cores & (c->x86_num_cores - 1))
761 c->x86_num_cores = 1;
762
763#ifdef CONFIG_NUMA
764 /* On a dual core setup the lower bits of apic id
765 distinguish the cores. Fix up the CPU<->node mappings
766 here based on that.
767 Assumes number of cores is a power of two.
768 When using SRAT use mapping from SRAT. */
769 cpu = c->x86_apicid;
770 if (acpi_numa <= 0 && c->x86_num_cores > 1) {
771 cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1);
772 if (!node_online(cpu_to_node[cpu]))
773 cpu_to_node[cpu] = first_node(node_online_map);
774 }
775 printk(KERN_INFO "CPU %d(%d) -> Node %d\n",
776 cpu, c->x86_num_cores, cpu_to_node[cpu]);
777#endif
778 }
779
780 return r;
781}
782
783static void __init detect_ht(struct cpuinfo_x86 *c)
784{
785#ifdef CONFIG_SMP
786 u32 eax, ebx, ecx, edx;
787 int index_lsb, index_msb, tmp;
788 int cpu = smp_processor_id();
789
790 if (!cpu_has(c, X86_FEATURE_HT))
791 return;
792
793 cpuid(1, &eax, &ebx, &ecx, &edx);
794 smp_num_siblings = (ebx & 0xff0000) >> 16;
795
796 if (smp_num_siblings == 1) {
797 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
798 } else if (smp_num_siblings > 1) {
799 index_lsb = 0;
800 index_msb = 31;
801 /*
802 * At this point we only support two siblings per
803 * processor package.
804 */
805 if (smp_num_siblings > NR_CPUS) {
806 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
807 smp_num_siblings = 1;
808 return;
809 }
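		/* compute ceil(log2(smp_num_siblings)): the number of low APIC-id bits used to tell siblings in a package apart */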
810 tmp = smp_num_siblings;
811 while ((tmp & 1) == 0) {
812 tmp >>=1 ;
813 index_lsb++;
814 }
815 tmp = smp_num_siblings;
816 while ((tmp & 0x80000000 ) == 0) {
817 tmp <<=1 ;
818 index_msb--;
819 }
820 if (index_lsb != index_msb )
821 index_msb++;
822 phys_proc_id[cpu] = phys_pkg_id(index_msb);
823
824 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
825 phys_proc_id[cpu]);
826 }
827#endif
828}
829
830static void __init sched_cmp_hack(struct cpuinfo_x86 *c)
831{
832#ifdef CONFIG_SMP
833 /* AMD dual core looks like HT but isn't really. Hide it from the
834 scheduler. This works around problems with the domain scheduler.
835 Also probably gives slightly better scheduling and disables
836 SMT nice which is harmful on dual core.
837 TBD tune the domain scheduler for dual core. */
838 if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY))
839 smp_num_siblings = 1;
840#endif
841}
842
843static void __init init_intel(struct cpuinfo_x86 *c)
844{
845 /* Cache sizes */
846 unsigned n;
847
848 init_intel_cacheinfo(c);
849 n = c->x86_cpuid_level;
850 if (n >= 0x80000008) {
851 unsigned eax = cpuid_eax(0x80000008);
852 c->x86_virt_bits = (eax >> 8) & 0xff;
853 c->x86_phys_bits = eax & 0xff;
854 }
855
856 if (c->x86 == 15)
857 c->x86_cache_alignment = c->x86_clflush_size * 2;
858}
859
860void __init get_cpu_vendor(struct cpuinfo_x86 *c)
861{
862 char *v = c->x86_vendor_id;
863
864 if (!strcmp(v, "AuthenticAMD"))
865 c->x86_vendor = X86_VENDOR_AMD;
866 else if (!strcmp(v, "GenuineIntel"))
867 c->x86_vendor = X86_VENDOR_INTEL;
868 else
869 c->x86_vendor = X86_VENDOR_UNKNOWN;
870}
871
872struct cpu_model_info {
873 int vendor;
874 int family;
875 char *model_names[16];
876};
877
878/* Do some early cpuid on the boot CPU to get some parameters that are
879 needed before check_bugs. Everything advanced is in identify_cpu
880 below. */
881void __init early_identify_cpu(struct cpuinfo_x86 *c)
882{
883 u32 tfms;
884
885 c->loops_per_jiffy = loops_per_jiffy;
886 c->x86_cache_size = -1;
887 c->x86_vendor = X86_VENDOR_UNKNOWN;
888 c->x86_model = c->x86_mask = 0; /* So far unknown... */
889 c->x86_vendor_id[0] = '\0'; /* Unset */
890 c->x86_model_id[0] = '\0'; /* Unset */
891 c->x86_clflush_size = 64;
892 c->x86_cache_alignment = c->x86_clflush_size;
893 c->x86_num_cores = 1;
894 c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data;
895 c->x86_cpuid_level = 0;
896 memset(&c->x86_capability, 0, sizeof c->x86_capability);
897
898 /* Get vendor name */
899 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
900 (unsigned int *)&c->x86_vendor_id[0],
901 (unsigned int *)&c->x86_vendor_id[8],
902 (unsigned int *)&c->x86_vendor_id[4]);
903
904 get_cpu_vendor(c);
905
906 /* Initialize the standard set of capabilities */
907 /* Note that the vendor-specific code below might override */
908
909 /* Intel-defined flags: level 0x00000001 */
910 if (c->cpuid_level >= 0x00000001) {
911 __u32 misc;
912 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
913 &c->x86_capability[0]);
914 c->x86 = (tfms >> 8) & 0xf;
915 c->x86_model = (tfms >> 4) & 0xf;
916 c->x86_mask = tfms & 0xf;
917 if (c->x86 == 0xf) {
918 c->x86 += (tfms >> 20) & 0xff;
919 c->x86_model += ((tfms >> 16) & 0xF) << 4;
920 }
921 if (c->x86_capability[0] & (1<<19))
922 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
923 c->x86_apicid = misc >> 24;
924 } else {
925 /* Have CPUID level 0 only - unheard of */
926 c->x86 = 4;
927 }
928}
929
930/*
931 * This does the hard work of actually picking apart the CPU stuff...
932 */
933void __init identify_cpu(struct cpuinfo_x86 *c)
934{
935 int i;
936 u32 xlvl;
937
938 early_identify_cpu(c);
939
940 /* AMD-defined flags: level 0x80000001 */
941 xlvl = cpuid_eax(0x80000000);
942 c->x86_cpuid_level = xlvl;
943 if ((xlvl & 0xffff0000) == 0x80000000) {
944 if (xlvl >= 0x80000001) {
945 c->x86_capability[1] = cpuid_edx(0x80000001);
946 c->x86_capability[5] = cpuid_ecx(0x80000001);
947 }
948 if (xlvl >= 0x80000004)
949 get_model_name(c); /* Default name */
950 }
951
952 /* Transmeta-defined flags: level 0x80860001 */
953 xlvl = cpuid_eax(0x80860000);
954 if ((xlvl & 0xffff0000) == 0x80860000) {
955 /* Don't set x86_cpuid_level here for now, to avoid confusion. */
956 if (xlvl >= 0x80860001)
957 c->x86_capability[2] = cpuid_edx(0x80860001);
958 }
959
960 /*
961 * Vendor-specific initialization. In this section we
962 * canonicalize the feature flags, meaning if there are
963 * features a certain CPU supports which CPUID doesn't
964 * tell us, CPUID claiming incorrect flags, or other bugs,
965 * we handle them here.
966 *
967 * At the end of this section, c->x86_capability better
968 * indicate the features this CPU genuinely supports!
969 */
970 switch (c->x86_vendor) {
971 case X86_VENDOR_AMD:
972 init_amd(c);
973 break;
974
975 case X86_VENDOR_INTEL:
976 init_intel(c);
977 break;
978
979 case X86_VENDOR_UNKNOWN:
980 default:
981 display_cacheinfo(c);
982 break;
983 }
984
985 select_idle_routine(c);
986 detect_ht(c);
987 sched_cmp_hack(c);
988
989 /*
990 * On SMP, boot_cpu_data holds the common feature set between
991 * all CPUs; so make sure that we indicate which features are
992 * common between the CPUs. The first time this routine gets
993 * executed, c == &boot_cpu_data.
994 */
995 if (c != &boot_cpu_data) {
996 /* AND the already accumulated flags with these */
997 for (i = 0 ; i < NCAPINTS ; i++)
998 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
999 }
1000
1001#ifdef CONFIG_X86_MCE
1002 mcheck_init(c);
1003#endif
1004#ifdef CONFIG_NUMA
1005 if (c != &boot_cpu_data)
1006 numa_add_cpu(c - cpu_data);
1007#endif
1008}
1009
1010
1011void __init print_cpu_info(struct cpuinfo_x86 *c)
1012{
1013 if (c->x86_model_id[0])
1014 printk("%s", c->x86_model_id);
1015
1016 if (c->x86_mask || c->cpuid_level >= 0)
1017 printk(" stepping %02x\n", c->x86_mask);
1018 else
1019 printk("\n");
1020}
1021
1022/*
1023 * Get CPU information for use by the procfs.
1024 */
1025
1026static int show_cpuinfo(struct seq_file *m, void *v)
1027{
1028 struct cpuinfo_x86 *c = v;
1029
1030 /*
1031 * These flag bits must match the definitions in <asm/cpufeature.h>.
1032 * NULL means this bit is undefined or reserved; either way it doesn't
1033 * have meaning as far as Linux is concerned. Note that it's important
1034 * to realize there is a difference between this table and CPUID -- if
1035 * applications want to get the raw CPUID data, they should access
1036 * /dev/cpu/<cpu_nr>/cpuid instead.
1037 */
1038 static char *x86_cap_flags[] = {
1039 /* Intel-defined */
1040 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1041 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1042 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1043 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
1044
1045 /* AMD-defined */
1046 "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1047 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1048 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1049 NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow",
1050
1051 /* Transmeta-defined */
1052 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
1053 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1054 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1055 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1056
1057 /* Other (Linux-defined) */
1058 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL,
1059 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1060 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1061 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1062
1063 /* Intel-defined (#2) */
1064 "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
1065 "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
1066 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1067 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1068
1069 /* AMD-defined (#2) */
1070 "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL,
1071 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1072 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1073 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
1074 };
1075 static char *x86_power_flags[] = {
1076 "ts", /* temperature sensor */
1077 "fid", /* frequency id control */
1078 "vid", /* voltage id control */
1079 "ttp", /* thermal trip */
1080 "tm",
1081 "stc"
1082 };
1083
1084
1085#ifdef CONFIG_SMP
1086 if (!cpu_online(c-cpu_data))
1087 return 0;
1088#endif
1089
1090 seq_printf(m,"processor\t: %u\n"
1091 "vendor_id\t: %s\n"
1092 "cpu family\t: %d\n"
1093 "model\t\t: %d\n"
1094 "model name\t: %s\n",
1095 (unsigned)(c-cpu_data),
1096 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1097 c->x86,
1098 (int)c->x86_model,
1099 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1100
1101 if (c->x86_mask || c->cpuid_level >= 0)
1102 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1103 else
1104 seq_printf(m, "stepping\t: unknown\n");
1105
1106 if (cpu_has(c,X86_FEATURE_TSC)) {
1107 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1108 cpu_khz / 1000, (cpu_khz % 1000));
1109 }
1110
1111 /* Cache size */
1112 if (c->x86_cache_size >= 0)
1113 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1114
1115#ifdef CONFIG_SMP
1116 seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]);
1117 seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings);
1118#endif
1119
1120 seq_printf(m,
1121 "fpu\t\t: yes\n"
1122 "fpu_exception\t: yes\n"
1123 "cpuid level\t: %d\n"
1124 "wp\t\t: yes\n"
1125 "flags\t\t:",
1126 c->cpuid_level);
1127
1128 {
1129 int i;
1130 for ( i = 0 ; i < 32*NCAPINTS ; i++ )
1131 if ( test_bit(i, &c->x86_capability) &&
1132 x86_cap_flags[i] != NULL )
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139
1140 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits);
1147
1148 seq_printf(m, "power management:");
1149 {
1150 unsigned i;
1151 for (i = 0; i < 32; i++)
1152 if (c->x86_power & (1 << i)) {
1153 if (i < ARRAY_SIZE(x86_power_flags))
1154 seq_printf(m, " %s", x86_power_flags[i]);
1155 else
1156 seq_printf(m, " [%d]", i);
1157 }
1158 }
1159 seq_printf(m, "\n");
1160
1161 if (c->x86_num_cores > 1)
1162 seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
1163
1164 seq_printf(m, "\n\n");
1165
1166 return 0;
1167}
1168
1169static void *c_start(struct seq_file *m, loff_t *pos)
1170{
1171 return *pos < NR_CPUS ? cpu_data + *pos : NULL;
1172}
1173
1174static void *c_next(struct seq_file *m, void *v, loff_t *pos)
1175{
1176 ++*pos;
1177 return c_start(m, pos);
1178}
1179
1180static void c_stop(struct seq_file *m, void *v)
1181{
1182}
1183
1184struct seq_operations cpuinfo_op = {
1185 .start =c_start,
1186 .next = c_next,
1187 .stop = c_stop,
1188 .show = show_cpuinfo,
1189};
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
new file mode 100644
index 000000000000..b5305b04bc40
--- /dev/null
+++ b/arch/x86_64/kernel/setup64.c
@@ -0,0 +1,292 @@
1/*
2 * X86-64 specific CPU setup.
3 * Copyright (C) 1995 Linus Torvalds
4 * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
5 * See setup.c for older changelog.
6 * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
7 */
8#include <linux/config.h>
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/bootmem.h>
14#include <linux/bitops.h>
15#include <asm/pda.h>
16#include <asm/pgtable.h>
17#include <asm/processor.h>
18#include <asm/desc.h>
19#include <asm/atomic.h>
20#include <asm/mmu_context.h>
21#include <asm/smp.h>
22#include <asm/i387.h>
23#include <asm/percpu.h>
24#include <asm/mtrr.h>
25#include <asm/proto.h>
26#include <asm/mman.h>
27#include <asm/numa.h>
28
29char x86_boot_params[2048] __initdata = {0,};
30
31cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
32
33struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
34
35extern struct task_struct init_task;
36
37extern unsigned char __per_cpu_start[], __per_cpu_end[];
38
39extern struct desc_ptr cpu_gdt_descr[];
40struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
41
42char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
43
44unsigned long __supported_pte_mask = ~0UL;
45static int do_not_nx __initdata = 0;
46
47/* noexec=on|off
48Control non executable mappings for 64bit processes.
49
50on Enable(default)
51off Disable
52*/
53int __init nonx_setup(char *str)
54{
55 if (!strncmp(str, "on", 2)) {
56 __supported_pte_mask |= _PAGE_NX;
57 do_not_nx = 0;
58 } else if (!strncmp(str, "off", 3)) {
59 do_not_nx = 1;
60 __supported_pte_mask &= ~_PAGE_NX;
61 }
62 return 0;
63}
64__setup("noexec=", nonx_setup); /* parsed early actually */
65
66int force_personality32 = READ_IMPLIES_EXEC;
67
68/* noexec32=on|off
69Control non executable heap for 32bit processes.
70To control the stack too use noexec=off
71
72on PROT_READ does not imply PROT_EXEC for 32bit processes
73off PROT_READ implies PROT_EXEC (default)
74*/
75static int __init nonx32_setup(char *str)
76{
77 if (!strcmp(str, "on"))
78 force_personality32 &= ~READ_IMPLIES_EXEC;
79 else if (!strcmp(str, "off"))
80 force_personality32 |= READ_IMPLIES_EXEC;
81 return 0;
82}
83__setup("noexec32=", nonx32_setup);
84
85/*
86 * Great future plan:
87 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
88 * Always point %gs to its beginning
89 */
90void __init setup_per_cpu_areas(void)
91{
92 int i;
93 unsigned long size;
94
95 /* Copy section for each CPU (we discard the original) */
96 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
97#ifdef CONFIG_MODULES
98 if (size < PERCPU_ENOUGH_ROOM)
99 size = PERCPU_ENOUGH_ROOM;
100#endif
101
102 for (i = 0; i < NR_CPUS; i++) {
103 unsigned char *ptr;
104
105 if (!NODE_DATA(cpu_to_node(i))) {
106 printk("cpu with no node %d, num_online_nodes %d\n",
107 i, num_online_nodes());
108 ptr = alloc_bootmem(size);
109 } else {
110 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
111 }
112 if (!ptr)
113 panic("Cannot allocate cpu data for CPU %d\n", i);
114 cpu_pda[i].data_offset = ptr - __per_cpu_start;
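		/* remember the offset from __per_cpu_start so per-CPU variable accesses can be relocated to this CPU's copy */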
115 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
116 }
117}
118
119void pda_init(int cpu)
120{
121 struct x8664_pda *pda = &cpu_pda[cpu];
122
123 /* Set up data that may be needed in __get_free_pages early */
124 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
125 wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
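	/* %gs now bases read_pda()/write_pda() accesses on this CPU's PDA */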
126
127 pda->me = pda;
128 pda->cpunumber = cpu;
129 pda->irqcount = -1;
130 pda->kernelstack =
131 (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
132 pda->active_mm = &init_mm;
133 pda->mmu_state = 0;
134
135 if (cpu == 0) {
136 /* others are initialized in smpboot.c */
137 pda->pcurrent = &init_task;
138 pda->irqstackptr = boot_cpu_stack;
139 } else {
140 pda->irqstackptr = (char *)
141 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
142 if (!pda->irqstackptr)
143 panic("cannot allocate irqstack for cpu %d", cpu);
144 }
145
146 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
147
148 pda->irqstackptr += IRQSTACKSIZE-64;
149}
150
151char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ]
152__attribute__((section(".bss.page_aligned")));
153
154/* May not be marked __init: used by software suspend */
155void syscall_init(void)
156{
157 /*
158 * LSTAR and STAR live in a bit strange symbiosis.
159 * They both write to the same internal register. STAR allows setting CS/DS
160 * but only a 32bit target. LSTAR sets the 64bit rip.
161 */
162 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
163 wrmsrl(MSR_LSTAR, system_call);
164
165#ifdef CONFIG_IA32_EMULATION
166 syscall32_cpu_init ();
167#endif
168
169 /* Flags to clear on syscall */
170 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
171}
172
173void __init check_efer(void)
174{
175 unsigned long efer;
176
177 rdmsrl(MSR_EFER, efer);
178 if (!(efer & EFER_NX) || do_not_nx) {
179 __supported_pte_mask &= ~_PAGE_NX;
180 }
181}
182
183/*
184 * cpu_init() initializes state that is per-CPU. Some data is already
185 * initialized (naturally) in the bootstrap process, such as the GDT
186 * and IDT. We reload them nevertheless, this function acts as a
187 * 'CPU state barrier', nothing should get across.
188 * A lot of state is already set up in PDA init.
189 */
190void __init cpu_init (void)
191{
192#ifdef CONFIG_SMP
193 int cpu = stack_smp_processor_id();
194#else
195 int cpu = smp_processor_id();
196#endif
197 struct tss_struct *t = &per_cpu(init_tss, cpu);
198 unsigned long v;
199 char *estacks = NULL;
200 struct task_struct *me;
201 int i;
202
203 /* CPU 0 is initialised in head64.c */
204 if (cpu != 0) {
205 pda_init(cpu);
206 } else
207 estacks = boot_exception_stacks;
208
209 me = current;
210
211 if (cpu_test_and_set(cpu, cpu_initialized))
212 panic("CPU#%d already initialized!\n", cpu);
213
214 printk("Initializing CPU#%d\n", cpu);
215
216 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
217
218 /*
219 * Initialize the per-CPU GDT with the boot GDT,
220 * and set up the GDT descriptor:
221 */
222 if (cpu) {
223 memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE);
224 }
225
226 cpu_gdt_descr[cpu].size = GDT_SIZE;
227 cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
228 asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
229 asm volatile("lidt %0" :: "m" (idt_descr));
230
231 memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
232
233 /*
234 * Delete NT
235 */
236
237 asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
238
239 syscall_init();
240
241 wrmsrl(MSR_FS_BASE, 0);
242 wrmsrl(MSR_KERNEL_GS_BASE, 0);
243 barrier();
244
245 check_efer();
246
247 /*
248 * set up and load the per-CPU TSS
249 */
250 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
251 if (cpu) {
252 estacks = (char *)__get_free_pages(GFP_ATOMIC,
253 EXCEPTION_STACK_ORDER);
254 if (!estacks)
255 panic("Cannot allocate exception stack %ld %d\n",
256 v, cpu);
257 }
258 estacks += EXCEPTION_STKSZ;
259 t->ist[v] = (unsigned long)estacks;
260 }
261
262 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
263 /*
264 * <= is required because the CPU will access up to
265 * 8 bits beyond the end of the IO permission bitmap.
266 */
267 for (i = 0; i <= IO_BITMAP_LONGS; i++)
268 t->io_bitmap[i] = ~0UL;
269
270 atomic_inc(&init_mm.mm_count);
271 me->active_mm = &init_mm;
272 if (me->mm)
273 BUG();
274 enter_lazy_tlb(&init_mm, me);
275
276 set_tss_desc(cpu, t);
277 load_TR_desc();
278 load_LDT(&init_mm.context);
279
280 /*
281 * Clear all 6 debug registers:
282 */
283
284 set_debug(0UL, 0);
285 set_debug(0UL, 1);
286 set_debug(0UL, 2);
287 set_debug(0UL, 3);
288 set_debug(0UL, 6);
289 set_debug(0UL, 7);
290
291 fpu_init();
292}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
new file mode 100644
index 000000000000..5ace32c6ac5c
--- /dev/null
+++ b/arch/x86_64/kernel/signal.c
@@ -0,0 +1,486 @@
1/*
2 * linux/arch/x86_64/kernel/signal.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 *
7 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
8 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
9 * 2000-2002 x86-64 support by Andi Kleen
10 *
11 * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
12 */
13
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/smp_lock.h>
18#include <linux/kernel.h>
19#include <linux/signal.h>
20#include <linux/errno.h>
21#include <linux/wait.h>
22#include <linux/ptrace.h>
23#include <linux/unistd.h>
24#include <linux/stddef.h>
25#include <linux/personality.h>
26#include <linux/compiler.h>
27#include <asm/ucontext.h>
28#include <asm/uaccess.h>
29#include <asm/i387.h>
30#include <asm/proto.h>
31
32/* #define DEBUG_SIG 1 */
33
34#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
35
36void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
37 sigset_t *set, struct pt_regs * regs);
38void ia32_setup_frame(int sig, struct k_sigaction *ka,
39 sigset_t *set, struct pt_regs * regs);
40
41asmlinkage long
42sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
43{
44 sigset_t saveset, newset;
45
46 /* XXX: Don't preclude handling different sized sigset_t's. */
47 if (sigsetsize != sizeof(sigset_t))
48 return -EINVAL;
49
50 if (copy_from_user(&newset, unewset, sizeof(newset)))
51 return -EFAULT;
52 sigdelsetmask(&newset, ~_BLOCKABLE);
53
54 spin_lock_irq(&current->sighand->siglock);
55 saveset = current->blocked;
56 current->blocked = newset;
57 recalc_sigpending();
58 spin_unlock_irq(&current->sighand->siglock);
59#ifdef DEBUG_SIG
60 printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
61 saveset, newset, regs, regs->rip);
62#endif
63 regs->rax = -EINTR;
64 while (1) {
65 current->state = TASK_INTERRUPTIBLE;
66 schedule();
67 if (do_signal(regs, &saveset))
68 return -EINTR;
69 }
70}
71
72asmlinkage long
73sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
74 struct pt_regs *regs)
75{
76 return do_sigaltstack(uss, uoss, regs->rsp);
77}
78
79
80/*
81 * Do a signal return; undo the signal stack.
82 */
83
84struct rt_sigframe
85{
86 char *pretcode;
87 struct ucontext uc;
88 struct siginfo info;
89};
90
91static int
92restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
93{
94 unsigned int err = 0;
95
96 /* Always make any pending restarted system calls return -EINTR */
97 current_thread_info()->restart_block.fn = do_no_restart_syscall;
98
99#define COPY(x) err |= __get_user(regs->x, &sc->x)
100
101 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
102 COPY(rdx); COPY(rcx); COPY(rip);
103 COPY(r8);
104 COPY(r9);
105 COPY(r10);
106 COPY(r11);
107 COPY(r12);
108 COPY(r13);
109 COPY(r14);
110 COPY(r15);
111
112 {
113 unsigned int tmpflags;
114 err |= __get_user(tmpflags, &sc->eflags);
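		/* 0x40DD5 lets the frame restore only CF, PF, AF, ZF, SF, TF, DF, OF and AC; all other eflags bits are preserved */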
115 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
116 regs->orig_rax = -1; /* disable syscall checks */
117 }
118
119 {
120 struct _fpstate __user * buf;
121 err |= __get_user(buf, &sc->fpstate);
122
123 if (buf) {
124 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
125 goto badframe;
126 err |= restore_i387(buf);
127 } else {
128 struct task_struct *me = current;
129 if (used_math()) {
130 clear_fpu(me);
131 clear_used_math();
132 }
133 }
134 }
135
136 err |= __get_user(*prax, &sc->rax);
137 return err;
138
139badframe:
140 return 1;
141}
142
143asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
144{
145 struct rt_sigframe __user *frame;
146 sigset_t set;
147 unsigned long eax;
148
149 frame = (struct rt_sigframe __user *)(regs->rsp - 8);
150 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
151 goto badframe;
152 }
153 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
154 goto badframe;
155 }
156
157 sigdelsetmask(&set, ~_BLOCKABLE);
158 spin_lock_irq(&current->sighand->siglock);
159 current->blocked = set;
160 recalc_sigpending();
161 spin_unlock_irq(&current->sighand->siglock);
162
163 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
164 goto badframe;
165
166#ifdef DEBUG_SIG
167 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax);
168#endif
169
170 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
171 goto badframe;
172
173 return eax;
174
175badframe:
176 signal_fault(regs,frame,"sigreturn");
177 return 0;
178}
179
180/*
181 * Set up a signal frame.
182 */
183
184static inline int
185setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
186{
187 int err = 0;
188 unsigned long eflags;
189
190 err |= __put_user(0, &sc->gs);
191 err |= __put_user(0, &sc->fs);
192
193 err |= __put_user(regs->rdi, &sc->rdi);
194 err |= __put_user(regs->rsi, &sc->rsi);
195 err |= __put_user(regs->rbp, &sc->rbp);
196 err |= __put_user(regs->rsp, &sc->rsp);
197 err |= __put_user(regs->rbx, &sc->rbx);
198 err |= __put_user(regs->rdx, &sc->rdx);
199 err |= __put_user(regs->rcx, &sc->rcx);
200 err |= __put_user(regs->rax, &sc->rax);
201 err |= __put_user(regs->r8, &sc->r8);
202 err |= __put_user(regs->r9, &sc->r9);
203 err |= __put_user(regs->r10, &sc->r10);
204 err |= __put_user(regs->r11, &sc->r11);
205 err |= __put_user(regs->r12, &sc->r12);
206 err |= __put_user(regs->r13, &sc->r13);
207 err |= __put_user(regs->r14, &sc->r14);
208 err |= __put_user(regs->r15, &sc->r15);
209 err |= __put_user(me->thread.trap_no, &sc->trapno);
210 err |= __put_user(me->thread.error_code, &sc->err);
211 err |= __put_user(regs->rip, &sc->rip);
212 eflags = regs->eflags;
213 if (current->ptrace & PT_PTRACED) {
214 eflags &= ~TF_MASK;
215 }
216 err |= __put_user(eflags, &sc->eflags);
217 err |= __put_user(mask, &sc->oldmask);
218 err |= __put_user(me->thread.cr2, &sc->cr2);
219
220 return err;
221}
222
223/*
224 * Determine which stack to use..
225 */
226
227static void __user *
228get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
229{
230 unsigned long rsp;
231
232 /* Default to using the normal stack, below the 128-byte red zone mandated by the x86-64 ABI */
233 rsp = regs->rsp - 128;
234
235 /* This is the X/Open sanctioned signal stack switching. */
236 /* RED-PEN: redzone on that stack? */
237 if (ka->sa.sa_flags & SA_ONSTACK) {
238 if (sas_ss_flags(rsp) == 0)
239 rsp = current->sas_ss_sp + current->sas_ss_size;
240 }
241
242 return (void __user *)round_down(rsp - size, 16);
243}
244
245static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 sigset_t *set, struct pt_regs * regs)
247{
248 struct rt_sigframe __user *frame;
249 struct _fpstate __user *fp = NULL;
250 int err = 0;
251 struct task_struct *me = current;
252
253 if (used_math()) {
254 fp = get_stack(ka, regs, sizeof(struct _fpstate));
255 frame = (void __user *)round_down(
256 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
257
258 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
259 goto give_sigsegv;
260
261 if (save_i387(fp) < 0)
262 err |= -1;
263 } else
264 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
265
266 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
267 goto give_sigsegv;
268
269 if (ka->sa.sa_flags & SA_SIGINFO) {
270 err |= copy_siginfo_to_user(&frame->info, info);
271 if (err)
272 goto give_sigsegv;
273 }
274
275 /* Create the ucontext. */
276 err |= __put_user(0, &frame->uc.uc_flags);
277 err |= __put_user(0, &frame->uc.uc_link);
278 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
279 err |= __put_user(sas_ss_flags(regs->rsp),
280 &frame->uc.uc_stack.ss_flags);
281 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
282 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
283 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
284 if (sizeof(*set) == 16) {
285 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
286 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
287 } else
288 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
289
290 /* Set up to return from userspace. If provided, use a stub
291 already in userspace. */
292 /* x86-64 should always use SA_RESTORER. */
293 if (ka->sa.sa_flags & SA_RESTORER) {
294 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
295 } else {
296 /* could use a vstub here */
297 goto give_sigsegv;
298 }
299
300 if (err)
301 goto give_sigsegv;
302
303#ifdef DEBUG_SIG
304 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
305#endif
306
307 /* Set up registers for signal handler */
308 {
309 struct exec_domain *ed = current_thread_info()->exec_domain;
310 if (unlikely(ed && ed->signal_invmap && sig < 32))
311 sig = ed->signal_invmap[sig];
312 }
313 regs->rdi = sig;
314 /* In case the signal handler was declared without prototypes */
315 regs->rax = 0;
316
317 /* This also works for non SA_SIGINFO handlers because they expect the
318 next argument after the signal number on the stack. */
319 regs->rsi = (unsigned long)&frame->info;
320 regs->rdx = (unsigned long)&frame->uc;
321 regs->rip = (unsigned long) ka->sa.sa_handler;
322
323 regs->rsp = (unsigned long)frame;
324
325 set_fs(USER_DS);
326 if (regs->eflags & TF_MASK) {
327 if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) {
328 ptrace_notify(SIGTRAP);
329 } else {
330 regs->eflags &= ~TF_MASK;
331 }
332 }
333
334#ifdef DEBUG_SIG
335 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
336 current->comm, current->pid, frame, regs->rip, frame->pretcode);
337#endif
338
339 return;
340
341give_sigsegv:
342 force_sigsegv(sig, current);
343}
344
345/*
346 * OK, we're invoking a handler
347 */
348
349static void
350handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
351 sigset_t *oldset, struct pt_regs *regs)
352{
353#ifdef DEBUG_SIG
354 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
355 current->pid, sig,
356 regs->rip, regs->rsp, regs);
357#endif
358
359 /* Are we from a system call? */
360 if ((long)regs->orig_rax >= 0) {
361 /* If so, check system call restarting.. */
362 switch (regs->rax) {
363 case -ERESTART_RESTARTBLOCK:
364 case -ERESTARTNOHAND:
365 regs->rax = -EINTR;
366 break;
367
368 case -ERESTARTSYS:
369 if (!(ka->sa.sa_flags & SA_RESTART)) {
370 regs->rax = -EINTR;
371 break;
372 }
373 /* fallthrough */
374 case -ERESTARTNOINTR:
375 regs->rax = regs->orig_rax;
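			/* the syscall instruction is two bytes long; back rip up so it is re-executed */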
376 regs->rip -= 2;
377 break;
378 }
379 }
380
381#ifdef CONFIG_IA32_EMULATION
382 if (test_thread_flag(TIF_IA32)) {
383 if (ka->sa.sa_flags & SA_SIGINFO)
384 ia32_setup_rt_frame(sig, ka, info, oldset, regs);
385 else
386 ia32_setup_frame(sig, ka, oldset, regs);
387 } else
388#endif
389 setup_rt_frame(sig, ka, info, oldset, regs);
390
391 if (!(ka->sa.sa_flags & SA_NODEFER)) {
392 spin_lock_irq(&current->sighand->siglock);
393 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
394 sigaddset(&current->blocked,sig);
395 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock);
397 }
398}
399
400/*
401 * Note that 'init' is a special process: it doesn't get signals it doesn't
402 * want to handle. Thus you cannot kill init even with a SIGKILL even by
403 * mistake.
404 */
405int do_signal(struct pt_regs *regs, sigset_t *oldset)
406{
407 struct k_sigaction ka;
408 siginfo_t info;
409 int signr;
410
411 /*
412 * We want the common case to go fast, which
413 * is why we may in certain cases get here from
414 * kernel mode. Just return without doing anything
415 * if so.
416 */
417 if ((regs->cs & 3) != 3)
418 return 1;
419
420 if (try_to_freeze(0))
421 goto no_signal;
422
423 if (!oldset)
424 oldset = &current->blocked;
425
426 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
427 if (signr > 0) {
428 /* Reenable any watchpoints before delivering the
429 * signal to user space. The processor register will
430 * have been cleared if the watchpoint triggered
431 * inside the kernel.
432 */
433 if (current->thread.debugreg7)
434 asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
435
436 /* Whee! Actually deliver the signal. */
437 handle_signal(signr, &info, &ka, oldset, regs);
438 return 1;
439 }
440
441 no_signal:
442 /* Did we come from a system call? */
443 if ((long)regs->orig_rax >= 0) {
444 /* Restart the system call - no handlers present */
445 long res = regs->rax;
446 if (res == -ERESTARTNOHAND ||
447 res == -ERESTARTSYS ||
448 res == -ERESTARTNOINTR) {
449 regs->rax = regs->orig_rax;
450 regs->rip -= 2;
451 }
452 if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
453 regs->rax = __NR_restart_syscall;
454 regs->rip -= 2;
455 }
456 }
457 return 0;
458}
459
460void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
461{
462#ifdef DEBUG_SIG
463 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
464 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
465#endif
466
467 /* Pending single-step? */
468 if (thread_info_flags & _TIF_SINGLESTEP) {
469 regs->eflags |= TF_MASK;
470 clear_thread_flag(TIF_SINGLESTEP);
471 }
472
473 /* deal with pending signal delivery */
474 if (thread_info_flags & _TIF_SIGPENDING)
475 do_signal(regs,oldset);
476}
477
478void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
479{
480 struct task_struct *me = current;
481 if (exception_trace)
482 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
483 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
484
485 force_sig(SIGSEGV, me);
486}
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
new file mode 100644
index 000000000000..e3cdbf9a88bd
--- /dev/null
+++ b/arch/x86_64/kernel/smp.c
@@ -0,0 +1,415 @@
1/*
2 * Intel SMP support routines.
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
6 * (c) 2002,2003 Andi Kleen, SuSE Labs.
7 *
8 * This code is released under the GNU General Public License version 2 or
9 * later.
10 */
11
12#include <linux/init.h>
13
14#include <linux/mm.h>
15#include <linux/irq.h>
16#include <linux/delay.h>
17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
19#include <linux/smp.h>
20#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h>
22#include <linux/interrupt.h>
23
24#include <asm/mtrr.h>
25#include <asm/pgalloc.h>
26#include <asm/tlbflush.h>
27#include <asm/mach_apic.h>
28#include <asm/mmu_context.h>
29#include <asm/proto.h>
30
31/*
32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds.
34 *
35 * These mean you can really definitely utterly forget about
36 * writing to user space from interrupts. (It's not allowed anyway).
37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 */
40
41static cpumask_t flush_cpumask;
42static struct mm_struct * flush_mm;
43static unsigned long flush_va;
44static DEFINE_SPINLOCK(tlbstate_lock);
45#define FLUSH_ALL -1ULL
46
47/*
48 * We cannot call mmdrop() because we are in interrupt context,
49 * instead update mm->cpu_vm_mask.
50 */
51static inline void leave_mm (unsigned long cpu)
52{
53 if (read_pda(mmu_state) == TLBSTATE_OK)
54 BUG();
55 clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
56 load_cr3(swapper_pg_dir);
57}
58
59/*
60 *
61 * The flush IPI assumes that a thread switch happens in this order:
62 * [cpu0: the cpu that switches]
63 * 1) switch_mm() either 1a) or 1b)
64 * 1a) thread switch to a different mm
65 * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
66 * Stop ipi delivery for the old mm. This is not synchronized with
67 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
68 * for the wrong mm, and in the worst case we perform a superfluous
69 * tlb flush.
70 * 1a2) set cpu mmu_state to TLBSTATE_OK
71 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
72 * was in lazy tlb mode.
73 * 1a3) update cpu active_mm
74 * Now cpu0 accepts tlb flushes for the new mm.
75 * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
76 * Now the other cpus will send tlb flush ipis.
77 * 1a5) change cr3.
78 * 1b) thread switch without mm change
79 * cpu active_mm is correct, cpu0 already handles
80 * flush ipis.
81 * 1b1) set cpu mmu_state to TLBSTATE_OK
82 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
83 * Atomically set the bit [other cpus will start sending flush ipis],
84 * and test the bit.
85 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
86 * 2) switch %%esp, ie current
87 *
88 * The interrupt must handle 2 special cases:
89 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
90 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
91 * runs in kernel space, the cpu could load tlb entries for user space
92 * pages.
93 *
94 * The good news is that cpu mmu_state is local to each cpu, no
95 * write/read ordering problems.
96 */
97
98/*
99 * TLB flush IPI:
100 *
101 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
102 * 2) Leave the mm if we are in the lazy tlb mode.
103 */
104
105asmlinkage void smp_invalidate_interrupt (void)
106{
107 unsigned long cpu;
108
109 cpu = get_cpu();
110
111 if (!cpu_isset(cpu, flush_cpumask))
112 goto out;
113 /*
114 * This was a BUG() but until someone can quote me the
115 * line from the intel manual that guarantees an IPI to
116 * multiple CPUs is retried _only_ on the erroring CPUs
117 * it's staying as a return
118 *
119 * BUG();
120 */
121
122 if (flush_mm == read_pda(active_mm)) {
123 if (read_pda(mmu_state) == TLBSTATE_OK) {
124 if (flush_va == FLUSH_ALL)
125 local_flush_tlb();
126 else
127 __flush_tlb_one(flush_va);
128 } else
129 leave_mm(cpu);
130 }
131 ack_APIC_irq();
132 cpu_clear(cpu, flush_cpumask);
133
134out:
135 put_cpu_no_resched();
136}
137
138static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
139 unsigned long va)
140{
141 cpumask_t tmp;
142 /*
143 * A couple of (to be removed) sanity checks:
144 *
145 * - we do not send IPIs to not-yet booted CPUs.
146 * - current CPU must not be in mask
147 * - mask must exist :)
148 */
149 BUG_ON(cpus_empty(cpumask));
150 cpus_and(tmp, cpumask, cpu_online_map);
151 BUG_ON(!cpus_equal(tmp, cpumask));
152 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
153 if (!mm)
154 BUG();
155
156 /*
157 * I'm not happy about this global shared spinlock in the
158 * MM hot path, but we'll see how contended it is.
159 * Temporarily this turns IRQs off, so that lockups are
160 * detected by the NMI watchdog.
161 */
162 spin_lock(&tlbstate_lock);
163
164 flush_mm = mm;
165 flush_va = va;
166 cpus_or(flush_cpumask, cpumask, flush_cpumask);
167
168 /*
169 * We have to send the IPI only to
170 * CPUs affected.
171 */
172 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
173
174 while (!cpus_empty(flush_cpumask))
175 mb(); /* nothing. lockup detection does not belong here */;
176
177 flush_mm = NULL;
178 flush_va = 0;
179 spin_unlock(&tlbstate_lock);
180}
181
182void flush_tlb_current_task(void)
183{
184 struct mm_struct *mm = current->mm;
185 cpumask_t cpu_mask;
186
187 preempt_disable();
188 cpu_mask = mm->cpu_vm_mask;
189 cpu_clear(smp_processor_id(), cpu_mask);
190
191 local_flush_tlb();
192 if (!cpus_empty(cpu_mask))
193 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
194 preempt_enable();
195}
196
197void flush_tlb_mm (struct mm_struct * mm)
198{
199 cpumask_t cpu_mask;
200
201 preempt_disable();
202 cpu_mask = mm->cpu_vm_mask;
203 cpu_clear(smp_processor_id(), cpu_mask);
204
205 if (current->active_mm == mm) {
206 if (current->mm)
207 local_flush_tlb();
208 else
209 leave_mm(smp_processor_id());
210 }
211 if (!cpus_empty(cpu_mask))
212 flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
213
214 preempt_enable();
215}
216
217void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
218{
219 struct mm_struct *mm = vma->vm_mm;
220 cpumask_t cpu_mask;
221
222 preempt_disable();
223 cpu_mask = mm->cpu_vm_mask;
224 cpu_clear(smp_processor_id(), cpu_mask);
225
226 if (current->active_mm == mm) {
227 if(current->mm)
228 __flush_tlb_one(va);
229 else
230 leave_mm(smp_processor_id());
231 }
232
233 if (!cpus_empty(cpu_mask))
234 flush_tlb_others(cpu_mask, mm, va);
235
236 preempt_enable();
237}
238
239static void do_flush_tlb_all(void* info)
240{
241 unsigned long cpu = smp_processor_id();
242
243 __flush_tlb_all();
244 if (read_pda(mmu_state) == TLBSTATE_LAZY)
245 leave_mm(cpu);
246}
247
248void flush_tlb_all(void)
249{
250 on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
251}
252
253void smp_kdb_stop(void)
254{
255 send_IPI_allbutself(KDB_VECTOR);
256}
257
258/*
259 * this function sends a 'reschedule' IPI to another CPU.
260 * it goes straight through and wastes no time serializing
261 * anything. Worst case is that we lose a reschedule ...
262 */
263
264void smp_send_reschedule(int cpu)
265{
266 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
267}
268
269/*
270 * Structure and data for smp_call_function(). This is designed to minimise
271 * static memory requirements. It also looks cleaner.
272 */
273static DEFINE_SPINLOCK(call_lock);
274
275struct call_data_struct {
276 void (*func) (void *info);
277 void *info;
278 atomic_t started;
279 atomic_t finished;
280 int wait;
281};
282
283static struct call_data_struct * call_data;
284
285/*
286 * this function sends a 'generic call function' IPI to all other CPUs
287 * in the system.
288 */
289static void __smp_call_function (void (*func) (void *info), void *info,
290 int nonatomic, int wait)
291{
292 struct call_data_struct data;
293 int cpus = num_online_cpus()-1;
294
295 if (!cpus)
296 return;
297
298 data.func = func;
299 data.info = info;
300 atomic_set(&data.started, 0);
301 data.wait = wait;
302 if (wait)
303 atomic_set(&data.finished, 0);
304
305 call_data = &data;
306 wmb();
307 /* Send a message to all other CPUs and wait for them to respond */
308 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
309
310 /* Wait for response */
311 while (atomic_read(&data.started) != cpus)
312 cpu_relax();
313
314 if (!wait)
315 return;
316
317 while (atomic_read(&data.finished) != cpus)
318 cpu_relax();
319}
320
321/*
322 * smp_call_function - run a function on all other CPUs.
323 * @func: The function to run. This must be fast and non-blocking.
324 * @info: An arbitrary pointer to pass to the function.
325 * @nonatomic: currently unused.
326 * @wait: If true, wait (atomically) until function has completed on other
327 * CPUs.
328 *
329 * Returns 0 on success, else a negative status code. Does not return until
330 * remote CPUs are nearly ready to execute func or have already executed it.
331 *
332 * You must not call this function with disabled interrupts or from a
333 * hardware interrupt handler or from a bottom half handler.
334 * Actually there are a few legal cases, like panic.
335 */
336int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
337 int wait)
338{
339 spin_lock(&call_lock);
340 __smp_call_function(func,info,nonatomic,wait);
341 spin_unlock(&call_lock);
342 return 0;
343}
344
345void smp_stop_cpu(void)
346{
347 /*
348 * Remove this CPU:
349 */
350 cpu_clear(smp_processor_id(), cpu_online_map);
351 local_irq_disable();
352 disable_local_APIC();
353 local_irq_enable();
354}
355
356static void smp_really_stop_cpu(void *dummy)
357{
358 smp_stop_cpu();
359 for (;;)
360 asm("hlt");
361}
362
363void smp_send_stop(void)
364{
365 int nolock = 0;
366 if (reboot_force)
367 return;
368 /* Don't deadlock on the call lock in panic */
369 if (!spin_trylock(&call_lock)) {
370		/* ignore locking because we have panicked anyway */
371 nolock = 1;
372 }
373 __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
374 if (!nolock)
375 spin_unlock(&call_lock);
376
377 local_irq_disable();
378 disable_local_APIC();
379 local_irq_enable();
380}
381
382/*
383 * Reschedule call back. Nothing to do,
384 * all the work is done automatically when
385 * we return from the interrupt.
386 */
387asmlinkage void smp_reschedule_interrupt(void)
388{
389 ack_APIC_irq();
390}
391
392asmlinkage void smp_call_function_interrupt(void)
393{
394 void (*func) (void *info) = call_data->func;
395 void *info = call_data->info;
396 int wait = call_data->wait;
397
398 ack_APIC_irq();
399 /*
400 * Notify initiating CPU that I've grabbed the data and am
401 * about to execute the function
402 */
403 mb();
404 atomic_inc(&call_data->started);
405 /*
406 * At this point the info structure may be out of scope unless wait==1
407 */
408 irq_enter();
409 (*func)(info);
410 irq_exit();
411 if (wait) {
412 mb();
413 atomic_inc(&call_data->finished);
414 }
415}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
new file mode 100644
index 000000000000..afd3600b919e
--- /dev/null
+++ b/arch/x86_64/kernel/smpboot.c
@@ -0,0 +1,938 @@
1/*
2 * x86 SMP booting functions
3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
5 * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
6 * Copyright 2001 Andi Kleen, SuSE Labs.
7 *
8 * Much of the core SMP work is based on previous work by Thomas Radke, to
9 * whom a great many thanks are extended.
10 *
11 * Thanks to Intel for making available several different Pentium,
12 * Pentium Pro and Pentium-II/Xeon MP machines.
13 * Original development of Linux SMP code supported by Caldera.
14 *
15 * This code is released under the GNU General Public License version 2 or
16 * later.
17 *
18 * Fixes
19 * Felix Koop : NR_CPUS used properly
20 * Jose Renau : Handle single CPU case.
21 * Alan Cox : By repeated request 8) - Total BogoMIP report.
22 * Greg Wright : Fix for kernel stacks panic.
23 * Erich Boleyn : MP v1.4 and additional changes.
24 * Matthias Sattler : Changes for 2.1 kernel map.
25 * Michel Lespinasse : Changes for 2.1 kernel map.
26 * Michael Chastain : Change trampoline.S to gnu as.
27 * Alan Cox : Dumb bug: 'B' step PPro's are fine
28 * Ingo Molnar : Added APIC timers, based on code
29 * from Jose Renau
30 * Ingo Molnar : various cleanups and rewrites
31 * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
32 * Maciej W. Rozycki : Bits for genuine 82489DX APICs
33 * Andi Kleen : Changed for SMP boot into long mode.
34 * Rusty Russell : Hacked into shape for new "hotplug" boot process.
35 */
36
37#include <linux/config.h>
38#include <linux/init.h>
39
40#include <linux/mm.h>
41#include <linux/kernel_stat.h>
42#include <linux/smp_lock.h>
43#include <linux/irq.h>
44#include <linux/bootmem.h>
45#include <linux/thread_info.h>
46#include <linux/module.h>
47
48#include <linux/delay.h>
49#include <linux/mc146818rtc.h>
50#include <asm/mtrr.h>
51#include <asm/pgalloc.h>
52#include <asm/desc.h>
53#include <asm/kdebug.h>
54#include <asm/tlbflush.h>
55#include <asm/proto.h>
56
57/* Number of siblings per CPU package */
58int smp_num_siblings = 1;
59/* Package ID of each logical CPU */
60u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
61EXPORT_SYMBOL(phys_proc_id);
62
63/* Bitmask of currently online CPUs */
64cpumask_t cpu_online_map;
65
66cpumask_t cpu_callin_map;
67cpumask_t cpu_callout_map;
68static cpumask_t smp_commenced_mask;
69
70/* Per CPU bogomips and other parameters */
71struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
72
73cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
74
75/*
76 * Trampoline 80x86 program as an array.
77 */
78
79extern unsigned char trampoline_data [];
80extern unsigned char trampoline_end [];
81
82/*
83 * Currently trivial. Write the real->protected mode
84 * bootstrap into the page concerned. The caller
85 * has made sure it's suitably aligned.
86 */
87
88static unsigned long __init setup_trampoline(void)
89{
90 void *tramp = __va(SMP_TRAMPOLINE_BASE);
91 memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
92 return virt_to_phys(tramp);
93}
94
95/*
96 * The bootstrap kernel entry code has set these up. Save them for
97 * a given CPU
98 */
99
100static void __init smp_store_cpu_info(int id)
101{
102 struct cpuinfo_x86 *c = cpu_data + id;
103
104 *c = boot_cpu_data;
105 identify_cpu(c);
106}
107
108/*
109 * TSC synchronization.
110 *
111 * We first check whether all CPUs have their TSC's synchronized,
112 * then we print a warning if not, and always resync.
113 */
114
115static atomic_t tsc_start_flag = ATOMIC_INIT(0);
116static atomic_t tsc_count_start = ATOMIC_INIT(0);
117static atomic_t tsc_count_stop = ATOMIC_INIT(0);
118static unsigned long long tsc_values[NR_CPUS];
119
120#define NR_LOOPS 5
121
122extern unsigned int fast_gettimeoffset_quotient;
123
124static void __init synchronize_tsc_bp (void)
125{
126 int i;
127 unsigned long long t0;
128 unsigned long long sum, avg;
129 long long delta;
130 long one_usec;
131 int buggy = 0;
132
133 printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus());
134
135 one_usec = cpu_khz;
136
137 atomic_set(&tsc_start_flag, 1);
138 wmb();
139
140 /*
141 * We loop a few times to get a primed instruction cache,
142 * then the last pass is more or less synchronized and
143 * the BP and APs set their cycle counters to zero all at
144 * once. This reduces the chance of having random offsets
145 * between the processors, and guarantees that the maximum
146 * delay between the cycle counters is never bigger than
147 * the latency of information-passing (cachelines) between
148 * two CPUs.
149 */
150 for (i = 0; i < NR_LOOPS; i++) {
151 /*
152 * all APs synchronize but they loop on '== num_cpus'
153 */
154 while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb();
155 atomic_set(&tsc_count_stop, 0);
156 wmb();
157 /*
158 * this lets the APs save their current TSC:
159 */
160 atomic_inc(&tsc_count_start);
161
162 sync_core();
163 rdtscll(tsc_values[smp_processor_id()]);
164 /*
165 * We clear the TSC in the last loop:
166 */
167 if (i == NR_LOOPS-1)
168 write_tsc(0, 0);
169
170 /*
171 * Wait for all APs to leave the synchronization point:
172 */
173 while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb();
174 atomic_set(&tsc_count_start, 0);
175 wmb();
176 atomic_inc(&tsc_count_stop);
177 }
178
179 sum = 0;
180 for (i = 0; i < NR_CPUS; i++) {
181 if (cpu_isset(i, cpu_callout_map)) {
182 t0 = tsc_values[i];
183 sum += t0;
184 }
185 }
186 avg = sum / num_booting_cpus();
187
188 sum = 0;
189 for (i = 0; i < NR_CPUS; i++) {
190 if (!cpu_isset(i, cpu_callout_map))
191 continue;
192
193 delta = tsc_values[i] - avg;
194 if (delta < 0)
195 delta = -delta;
196 /*
197 * We report bigger than 2 microseconds clock differences.
198 */
199 if (delta > 2*one_usec) {
200 long realdelta;
201 if (!buggy) {
202 buggy = 1;
203 printk("\n");
204 }
205 realdelta = delta / one_usec;
206 if (tsc_values[i] < avg)
207 realdelta = -realdelta;
208
209 printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
210 i, realdelta);
211 }
212
213 sum += delta;
214 }
215 if (!buggy)
216 printk("passed.\n");
217}
218
219static void __init synchronize_tsc_ap (void)
220{
221 int i;
222
223 /*
224 * Not every cpu is online at the time
225 * this gets called, so we first wait for the BP to
226 * finish SMP initialization:
227 */
228 while (!atomic_read(&tsc_start_flag)) mb();
229
230 for (i = 0; i < NR_LOOPS; i++) {
231 atomic_inc(&tsc_count_start);
232 while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb();
233
234 sync_core();
235 rdtscll(tsc_values[smp_processor_id()]);
236 if (i == NR_LOOPS-1)
237 write_tsc(0, 0);
238
239 atomic_inc(&tsc_count_stop);
240 while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
241 }
242}
243#undef NR_LOOPS
244
245static atomic_t init_deasserted;
246
247static void __init smp_callin(void)
248{
249 int cpuid, phys_id;
250 unsigned long timeout;
251
252 /*
253	 * If woken up by an INIT in an 82489DX configuration
254 * we may get here before an INIT-deassert IPI reaches
255 * our local APIC. We have to wait for the IPI or we'll
256 * lock up on an APIC access.
257 */
258 while (!atomic_read(&init_deasserted));
259
260 /*
261 * (This works even if the APIC is not enabled.)
262 */
263 phys_id = GET_APIC_ID(apic_read(APIC_ID));
264 cpuid = smp_processor_id();
265 if (cpu_isset(cpuid, cpu_callin_map)) {
266 panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
267 phys_id, cpuid);
268 }
269 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
270
271 /*
272 * STARTUP IPIs are fragile beasts as they might sometimes
273 * trigger some glue motherboard logic. Complete APIC bus
274	 * silence for 1 second; this overestimates, by a factor of two, the
275	 * time the boot CPU spends sending the up to 2 STARTUP IPIs.
276	 * This should be enough.
277 */
278
279 /*
280 * Waiting 2s total for startup (udelay is not yet working)
281 */
282 timeout = jiffies + 2*HZ;
283 while (time_before(jiffies, timeout)) {
284 /*
285		 * Has the boot CPU finished its STARTUP sequence?
286 */
287 if (cpu_isset(cpuid, cpu_callout_map))
288 break;
289 rep_nop();
290 }
291
292 if (!time_before(jiffies, timeout)) {
293 panic("smp_callin: CPU%d started up but did not get a callout!\n",
294 cpuid);
295 }
296
297 /*
298 * the boot CPU has finished the init stage and is spinning
299 * on callin_map until we finish. We are free to set up this
300 * CPU, first the APIC. (this is probably redundant on most
301 * boards)
302 */
303
304 Dprintk("CALLIN, before setup_local_APIC().\n");
305 setup_local_APIC();
306
307 local_irq_enable();
308
309 /*
310 * Get our bogomips.
311 */
312 calibrate_delay();
313 Dprintk("Stack at about %p\n",&cpuid);
314
315 disable_APIC_timer();
316
317 /*
318 * Save our processor parameters
319 */
320 smp_store_cpu_info(cpuid);
321
322 local_irq_disable();
323
324 /*
325 * Allow the master to continue.
326 */
327 cpu_set(cpuid, cpu_callin_map);
328
329 /*
330 * Synchronize the TSC with the BP
331 */
332 if (cpu_has_tsc)
333 synchronize_tsc_ap();
334}
335
336static int cpucount;
337
338/*
339 * Activate a secondary processor.
340 */
341void __init start_secondary(void)
342{
343 /*
344	 * Don't put anything before smp_callin(); SMP
345	 * booting is so fragile that we want to limit the
346	 * things done here to the most necessary ones.
347 */
348 cpu_init();
349 smp_callin();
350
351	/* otherwise gcc will move the smp_processor_id() call up before cpu_init() */
352 barrier();
353
354 Dprintk("cpu %d: waiting for commence\n", smp_processor_id());
355 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
356 rep_nop();
357
358 Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
359 setup_secondary_APIC_clock();
360
361 Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
362
363 if (nmi_watchdog == NMI_IO_APIC) {
364 disable_8259A_irq(0);
365 enable_NMI_through_LVT0(NULL);
366 enable_8259A_irq(0);
367 }
368
369
370 enable_APIC_timer();
371
372 /*
373 * low-memory mappings have been cleared, flush them from
374 * the local TLBs too.
375 */
376 local_flush_tlb();
377
378	Dprintk("cpu %d: setting cpu_online_map\n", smp_processor_id());
379 cpu_set(smp_processor_id(), cpu_online_map);
380 wmb();
381
382 cpu_idle();
383}
384
385extern volatile unsigned long init_rsp;
386extern void (*initial_code)(void);
387
388#if APIC_DEBUG
389static inline void inquire_remote_apic(int apicid)
390{
391 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
392 char *names[] = { "ID", "VERSION", "SPIV" };
393 int timeout, status;
394
395 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
396
397 for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
398 printk("... APIC #%d %s: ", apicid, names[i]);
399
400 /*
401 * Wait for idle.
402 */
403 apic_wait_icr_idle();
404
405 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
406 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
407
408 timeout = 0;
409 do {
410 udelay(100);
411 status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
412 } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
413
414 switch (status) {
415 case APIC_ICR_RR_VALID:
416 status = apic_read(APIC_RRR);
417 printk("%08x\n", status);
418 break;
419 default:
420 printk("failed\n");
421 }
422 }
423}
424#endif
425
426static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
427{
428 unsigned long send_status = 0, accept_status = 0;
429 int maxlvt, timeout, num_starts, j;
430
431 Dprintk("Asserting INIT.\n");
432
433 /*
434 * Turn INIT on target chip
435 */
436 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
437
438 /*
439 * Send IPI
440 */
441 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
442 | APIC_DM_INIT);
443
444 Dprintk("Waiting for send to finish...\n");
445 timeout = 0;
446 do {
447 Dprintk("+");
448 udelay(100);
449 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
450 } while (send_status && (timeout++ < 1000));
451
452 mdelay(10);
453
454 Dprintk("Deasserting INIT.\n");
455
456 /* Target chip */
457 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
458
459 /* Send IPI */
460 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
461
462 Dprintk("Waiting for send to finish...\n");
463 timeout = 0;
464 do {
465 Dprintk("+");
466 udelay(100);
467 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
468 } while (send_status && (timeout++ < 1000));
469
470 atomic_set(&init_deasserted, 1);
471
472 /*
473 * Should we send STARTUP IPIs ?
474 *
475 * Determine this based on the APIC version.
476 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
477 */
478 if (APIC_INTEGRATED(apic_version[phys_apicid]))
479 num_starts = 2;
480 else
481 num_starts = 0;
482
483 /*
484 * Run STARTUP IPI loop.
485 */
486 Dprintk("#startup loops: %d.\n", num_starts);
487
488 maxlvt = get_maxlvt();
489
490 for (j = 1; j <= num_starts; j++) {
491 Dprintk("Sending STARTUP #%d.\n",j);
492 apic_read_around(APIC_SPIV);
493 apic_write(APIC_ESR, 0);
494 apic_read(APIC_ESR);
495 Dprintk("After apic_write.\n");
496
497 /*
498 * STARTUP IPI
499 */
500
501 /* Target chip */
502 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
503
504 /* Boot on the stack */
505 /* Kick the second */
506 apic_write_around(APIC_ICR, APIC_DM_STARTUP
507 | (start_rip >> 12));
508
509 /*
510 * Give the other CPU some time to accept the IPI.
511 */
512 udelay(300);
513
514 Dprintk("Startup point 1.\n");
515
516 Dprintk("Waiting for send to finish...\n");
517 timeout = 0;
518 do {
519 Dprintk("+");
520 udelay(100);
521 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
522 } while (send_status && (timeout++ < 1000));
523
524 /*
525 * Give the other CPU some time to accept the IPI.
526 */
527 udelay(200);
528 /*
529 * Due to the Pentium erratum 3AP.
530 */
531 if (maxlvt > 3) {
532 apic_read_around(APIC_SPIV);
533 apic_write(APIC_ESR, 0);
534 }
535 accept_status = (apic_read(APIC_ESR) & 0xEF);
536 if (send_status || accept_status)
537 break;
538 }
539 Dprintk("After Startup.\n");
540
541 if (send_status)
542 printk(KERN_ERR "APIC never delivered???\n");
543 if (accept_status)
544 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
545
546 return (send_status | accept_status);
547}
548
549static void __init do_boot_cpu (int apicid)
550{
551 struct task_struct *idle;
552 unsigned long boot_error;
553 int timeout, cpu;
554 unsigned long start_rip;
555
556 cpu = ++cpucount;
557 /*
558	 * We can't use kernel_thread() since we must avoid
559	 * rescheduling the child.
560 */
561 idle = fork_idle(cpu);
562 if (IS_ERR(idle))
563 panic("failed fork for CPU %d", cpu);
564 x86_cpu_to_apicid[cpu] = apicid;
565
566 cpu_pda[cpu].pcurrent = idle;
567
568 start_rip = setup_trampoline();
569
570 init_rsp = idle->thread.rsp;
571 per_cpu(init_tss,cpu).rsp0 = init_rsp;
572 initial_code = start_secondary;
573 clear_ti_thread_flag(idle->thread_info, TIF_FORK);
574
575 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
576 start_rip, init_rsp);
577
578 /*
579 * This grunge runs the startup process for
580 * the targeted processor.
581 */
582
583 atomic_set(&init_deasserted, 0);
584
585 Dprintk("Setting warm reset code and vector.\n");
586
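	/*
	 * CMOS register 0x0F is the shutdown status byte; 0x0A should make
	 * the BIOS perform a warm reset that jumps through the vector stored
	 * at 0x40:0x67.  The two writes below store the trampoline address
	 * into that vector as segment (0x469) and offset (0x467).
	 */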
587 CMOS_WRITE(0xa, 0xf);
588 local_flush_tlb();
589 Dprintk("1.\n");
590 *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
591 Dprintk("2.\n");
592 *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
593 Dprintk("3.\n");
594
595 /*
596 * Be paranoid about clearing APIC errors.
597 */
598 if (APIC_INTEGRATED(apic_version[apicid])) {
599 apic_read_around(APIC_SPIV);
600 apic_write(APIC_ESR, 0);
601 apic_read(APIC_ESR);
602 }
603
604 /*
605 * Status is now clean
606 */
607 boot_error = 0;
608
609 /*
610 * Starting actual IPI sequence...
611 */
612 boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
613
614 if (!boot_error) {
615 /*
616 * allow APs to start initializing.
617 */
618 Dprintk("Before Callout %d.\n", cpu);
619 cpu_set(cpu, cpu_callout_map);
620 Dprintk("After Callout %d.\n", cpu);
621
622 /*
623 * Wait 5s total for a response
624 */
625 for (timeout = 0; timeout < 50000; timeout++) {
626 if (cpu_isset(cpu, cpu_callin_map))
627 break; /* It has booted */
628 udelay(100);
629 }
630
631 if (cpu_isset(cpu, cpu_callin_map)) {
632 /* number CPUs logically, starting from 1 (BSP is 0) */
633 Dprintk("OK.\n");
634 print_cpu_info(&cpu_data[cpu]);
635 Dprintk("CPU has booted.\n");
636 } else {
637 boot_error = 1;
638 if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
639 == 0xA5)
640 /* trampoline started but...? */
641 printk("Stuck ??\n");
642 else
643 /* trampoline code not run */
644 printk("Not responding.\n");
645#if APIC_DEBUG
646 inquire_remote_apic(apicid);
647#endif
648 }
649 }
650 if (boot_error) {
651 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
652 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
653 cpucount--;
654 x86_cpu_to_apicid[cpu] = BAD_APICID;
655 x86_cpu_to_log_apicid[cpu] = BAD_APICID;
656 }
657}
658
659static void smp_tune_scheduling (void)
660{
661 int cachesize; /* kB */
662 unsigned long bandwidth = 1000; /* MB/s */
663 /*
664 * Rough estimation for SMP scheduling, this is the number of
665 * cycles it takes for a fully memory-limited process to flush
666 * the SMP-local cache.
667 *
668 * (For a P5 this pretty much means we will choose another idle
669 * CPU almost always at wakeup time (this is due to the small
670 * L1 cache), on PIIs it's around 50-100 usecs, depending on
671 * the cache size)
672 */
673
674 if (!cpu_khz) {
675 return;
676 } else {
677 cachesize = boot_cpu_data.x86_cache_size;
678 if (cachesize == -1) {
679 cachesize = 16; /* Pentiums, 2x8kB cache */
680 bandwidth = 100;
681 }
682 }
683}
684
685/*
686 * Cycle through the processors sending APIC IPIs to boot each.
687 */
688
689static void __init smp_boot_cpus(unsigned int max_cpus)
690{
691 unsigned apicid, cpu, bit, kicked;
692
693 nmi_watchdog_default();
694
695 /*
696 * Setup boot CPU information
697 */
698 smp_store_cpu_info(0); /* Final full version of the data */
699 printk(KERN_INFO "CPU%d: ", 0);
700 print_cpu_info(&cpu_data[0]);
701
702 current_thread_info()->cpu = 0;
703 smp_tune_scheduling();
704
705 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
706 printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
707 hard_smp_processor_id());
708 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
709 }
710
711 /*
712 * If we couldn't find an SMP configuration at boot time,
713 * get out of here now!
714 */
715 if (!smp_found_config) {
716 printk(KERN_NOTICE "SMP motherboard not detected.\n");
717 io_apic_irqs = 0;
718 cpu_online_map = cpumask_of_cpu(0);
719 cpu_set(0, cpu_sibling_map[0]);
720 phys_cpu_present_map = physid_mask_of_physid(0);
721 if (APIC_init_uniprocessor())
722 printk(KERN_NOTICE "Local APIC not detected."
723 " Using dummy APIC emulation.\n");
724 goto smp_done;
725 }
726
727 /*
728 * Should not be necessary because the MP table should list the boot
729 * CPU too, but we do it for the sake of robustness anyway.
730 */
731 if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
732 printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
733 boot_cpu_id);
734 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
735 }
736
737 /*
738 * If we couldn't find a local APIC, then get out of here now!
739 */
740 if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) {
741 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
742 boot_cpu_id);
743 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
744 io_apic_irqs = 0;
745 cpu_online_map = cpumask_of_cpu(0);
746 cpu_set(0, cpu_sibling_map[0]);
747 phys_cpu_present_map = physid_mask_of_physid(0);
748 disable_apic = 1;
749 goto smp_done;
750 }
751
752 verify_local_APIC();
753
754 /*
755 * If SMP should be disabled, then really disable it!
756 */
757 if (!max_cpus) {
758 smp_found_config = 0;
759 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
760 io_apic_irqs = 0;
761 cpu_online_map = cpumask_of_cpu(0);
762 cpu_set(0, cpu_sibling_map[0]);
763 phys_cpu_present_map = physid_mask_of_physid(0);
764 disable_apic = 1;
765 goto smp_done;
766 }
767
768 connect_bsp_APIC();
769 setup_local_APIC();
770
771 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id)
772 BUG();
773
774 x86_cpu_to_apicid[0] = boot_cpu_id;
775
776 /*
777 * Now scan the CPU present map and fire up the other CPUs.
778 */
779 Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
780
781 kicked = 1;
782 for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
783 apicid = cpu_present_to_apicid(bit);
784 /*
785 * Don't even attempt to start the boot CPU!
786 */
787 if (apicid == boot_cpu_id || (apicid == BAD_APICID))
788 continue;
789
790 if (!physid_isset(apicid, phys_cpu_present_map))
791 continue;
792 if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
793 continue;
794
795 do_boot_cpu(apicid);
796 ++kicked;
797 }
798
799 /*
800 * Cleanup possible dangling ends...
801 */
802 {
803 /*
804 * Install writable page 0 entry to set BIOS data area.
805 */
806 local_flush_tlb();
807
808 /*
809 * Paranoid: Set warm reset code and vector here back
810 * to default values.
811 */
812 CMOS_WRITE(0, 0xf);
813
814 *((volatile int *) phys_to_virt(0x467)) = 0;
815 }
816
817 /*
818 * Allow the user to impress friends.
819 */
820
821 Dprintk("Before bogomips.\n");
822 if (!cpucount) {
823 printk(KERN_INFO "Only one processor found.\n");
824 } else {
825 unsigned long bogosum = 0;
826 for (cpu = 0; cpu < NR_CPUS; cpu++)
827 if (cpu_isset(cpu, cpu_callout_map))
828 bogosum += cpu_data[cpu].loops_per_jiffy;
829 printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
830 cpucount+1,
831 bogosum/(500000/HZ),
832 (bogosum/(5000/HZ))%100);
833 Dprintk("Before bogocount - setting activated=1.\n");
834 }
835
836 /*
837 * Construct cpu_sibling_map[], so that we can tell the
838 * sibling CPU efficiently.
839 */
840 for (cpu = 0; cpu < NR_CPUS; cpu++)
841 cpus_clear(cpu_sibling_map[cpu]);
842
843 for (cpu = 0; cpu < NR_CPUS; cpu++) {
844 int siblings = 0;
845 int i;
846 if (!cpu_isset(cpu, cpu_callout_map))
847 continue;
848
849 if (smp_num_siblings > 1) {
850 for (i = 0; i < NR_CPUS; i++) {
851 if (!cpu_isset(i, cpu_callout_map))
852 continue;
853 if (phys_proc_id[cpu] == phys_proc_id[i]) {
854 siblings++;
855 cpu_set(i, cpu_sibling_map[cpu]);
856 }
857 }
858 } else {
859 siblings++;
860 cpu_set(cpu, cpu_sibling_map[cpu]);
861 }
862
863 if (siblings != smp_num_siblings) {
864 printk(KERN_WARNING
865 "WARNING: %d siblings found for CPU%d, should be %d\n",
866 siblings, cpu, smp_num_siblings);
867 smp_num_siblings = siblings;
868 }
869 }
870
871 Dprintk("Boot done.\n");
872
873 /*
874 * Here we can be sure that there is an IO-APIC in the system. Let's
875 * go and set it up:
876 */
877 if (!skip_ioapic_setup && nr_ioapics)
878 setup_IO_APIC();
879 else
880 nr_ioapics = 0;
881
882 setup_boot_APIC_clock();
883
884 /*
885 * Synchronize the TSC with the AP
886 */
887 if (cpu_has_tsc && cpucount)
888 synchronize_tsc_bp();
889
890 smp_done:
891 time_init_smp();
892}
893
894/* These are wrappers to interface to the new boot process. Someone
895 who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
896void __init smp_prepare_cpus(unsigned int max_cpus)
897{
898 smp_boot_cpus(max_cpus);
899}
900
901void __devinit smp_prepare_boot_cpu(void)
902{
903 cpu_set(smp_processor_id(), cpu_online_map);
904 cpu_set(smp_processor_id(), cpu_callout_map);
905}
906
907int __devinit __cpu_up(unsigned int cpu)
908{
909 /* This only works at boot for x86. See "rewrite" above. */
910 if (cpu_isset(cpu, smp_commenced_mask)) {
911 local_irq_enable();
912 return -ENOSYS;
913 }
914
915 /* In case one didn't come up */
916 if (!cpu_isset(cpu, cpu_callin_map)) {
917 local_irq_enable();
918 return -EIO;
919 }
920 local_irq_enable();
921
922 /* Unleash the CPU! */
923 Dprintk("waiting for cpu %d\n", cpu);
924
925 cpu_set(cpu, smp_commenced_mask);
926 while (!cpu_isset(cpu, cpu_online_map))
927 mb();
928 return 0;
929}
930
931void __init smp_cpus_done(unsigned int max_cpus)
932{
933#ifdef CONFIG_X86_IO_APIC
934 setup_ioapic_dest();
935#endif
936 zap_low_mappings();
937}
938
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
new file mode 100644
index 000000000000..ebaa1e37d657
--- /dev/null
+++ b/arch/x86_64/kernel/suspend.c
@@ -0,0 +1,157 @@
1/*
2 * Suspend support specific for x86-64.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/config.h>
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/spinlock.h>
16#include <linux/poll.h>
17#include <linux/delay.h>
18#include <linux/sysrq.h>
19#include <linux/proc_fs.h>
20#include <linux/irq.h>
21#include <linux/pm.h>
22#include <linux/device.h>
23#include <linux/suspend.h>
24#include <asm/uaccess.h>
25#include <asm/acpi.h>
26#include <asm/tlbflush.h>
27#include <asm/io.h>
28#include <asm/proto.h>
29
30struct saved_context saved_context;
31
32unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
33unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
34unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
35unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
36unsigned long saved_context_eflags;
37
38void __save_processor_state(struct saved_context *ctxt)
39{
40 kernel_fpu_begin();
41
42 /*
43 * descriptor tables
44 */
45 asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
46 asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
47 asm volatile ("sldt %0" : "=m" (ctxt->ldt));
48 asm volatile ("str %0" : "=m" (ctxt->tr));
49
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
51 /* EFER should be constant for kernel version, no need to handle it. */
52 /*
53 * segment registers
54 */
55 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
56 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
57 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
58 asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
59 asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
60
61 rdmsrl(MSR_FS_BASE, ctxt->fs_base);
62 rdmsrl(MSR_GS_BASE, ctxt->gs_base);
63 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
64
65 /*
66 * control registers
67 */
68 asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
69 asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
70 asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
71 asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
72}
73
74void save_processor_state(void)
75{
76 __save_processor_state(&saved_context);
77}
78
79static void
80do_fpu_end(void)
81{
82 /* restore FPU regs if necessary */
83 /* Do it out of line so that gcc does not move cr0 load to some stupid place */
84 kernel_fpu_end();
85 mxcsr_feature_mask_init();
86}
87
88void __restore_processor_state(struct saved_context *ctxt)
89{
90 /*
91 * control registers
92 */
93 asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
94 asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
95 asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
96 asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
97
98 /*
99 * segment registers
100 */
101 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
102 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
103 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
104 load_gs_index(ctxt->gs);
105 asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
106
107 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
108 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
109 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
110
111 /*
112 * now restore the descriptor tables to their proper values
113	 * ltr is done in fix_processor_context().
114 */
115 asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
116 asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
117 asm volatile ("lldt %0" :: "m" (ctxt->ldt));
118
119 fix_processor_context();
120
121 do_fpu_end();
122}
123
124void restore_processor_state(void)
125{
126 __restore_processor_state(&saved_context);
127}
128
129void fix_processor_context(void)
130{
131 int cpu = smp_processor_id();
132 struct tss_struct *t = &per_cpu(init_tss, cpu);
133
134	set_tss_desc(cpu,t);	/* This just modifies memory; should not be necessary. But... it is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
135
136 cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9;
137
138 syscall_init(); /* This sets MSR_*STAR and related */
139 load_TR_desc(); /* This does ltr */
140 load_LDT(&current->active_mm->context); /* This does lldt */
141
142 /*
143 * Now maybe reload the debug registers
144 */
145 if (current->thread.debugreg7){
146 loaddebug(&current->thread, 0);
147 loaddebug(&current->thread, 1);
148 loaddebug(&current->thread, 2);
149 loaddebug(&current->thread, 3);
150 /* no 4 and 5 */
151 loaddebug(&current->thread, 6);
152 loaddebug(&current->thread, 7);
153 }
154
155}
156
157
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
new file mode 100644
index 000000000000..53f8e1659511
--- /dev/null
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -0,0 +1,104 @@
1/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
2 *
3 * Distribute under GPLv2.
4 *
5 * swsusp_arch_resume may not use any stack, nor any variable that is
6 * not "NoSave" during copying pages:
7 *
8 * It's rewriting one kernel image with another. What is stack in the "old"
9 * image could very well be a data page in the "new" image, and overwriting
10 * your own stack from under yourself is a bad idea.
11 */
12
13 .text
14#include <linux/linkage.h>
15#include <asm/segment.h>
16#include <asm/page.h>
17#include <asm/offset.h>
18
19ENTRY(swsusp_arch_suspend)
20
21 movq %rsp, saved_context_esp(%rip)
22 movq %rax, saved_context_eax(%rip)
23 movq %rbx, saved_context_ebx(%rip)
24 movq %rcx, saved_context_ecx(%rip)
25 movq %rdx, saved_context_edx(%rip)
26 movq %rbp, saved_context_ebp(%rip)
27 movq %rsi, saved_context_esi(%rip)
28 movq %rdi, saved_context_edi(%rip)
29 movq %r8, saved_context_r08(%rip)
30 movq %r9, saved_context_r09(%rip)
31 movq %r10, saved_context_r10(%rip)
32 movq %r11, saved_context_r11(%rip)
33 movq %r12, saved_context_r12(%rip)
34 movq %r13, saved_context_r13(%rip)
35 movq %r14, saved_context_r14(%rip)
36 movq %r15, saved_context_r15(%rip)
37 pushfq ; popq saved_context_eflags(%rip)
38
39 call swsusp_save
40 ret
41
42ENTRY(swsusp_arch_resume)
43 /* set up cr3 */
44 leaq init_level4_pgt(%rip),%rax
45 subq $__START_KERNEL_map,%rax
46 movq %rax,%cr3
47
48 movq mmu_cr4_features(%rip), %rax
49 movq %rax, %rdx
50 andq $~(1<<7), %rdx # PGE
51 movq %rdx, %cr4; # turn off PGE
52 movq %cr3, %rcx; # flush TLB
53 movq %rcx, %cr3;
54 movq %rax, %cr4; # turn PGE back on
55
56 movq pagedir_nosave(%rip), %rdx
57loop:
58 testq %rdx, %rdx
59 jz done
60
61 /* get addresses from the pbe and copy the page */
62 movq pbe_address(%rdx), %rsi
63 movq pbe_orig_address(%rdx), %rdi
64 movq $512, %rcx
65 rep
66 movsq
67
68 /* progress to the next pbe */
69 movq pbe_next(%rdx), %rdx
70 jmp loop
71done:
72 /* Flush TLB, including "global" things (vmalloc) */
73 movq mmu_cr4_features(%rip), %rax
74 movq %rax, %rdx
75 andq $~(1<<7), %rdx; # PGE
76 movq %rdx, %cr4; # turn off PGE
77 movq %cr3, %rcx; # flush TLB
78 movq %rcx, %cr3
79 movq %rax, %cr4; # turn PGE back on
80
81 movl $24, %eax
82 movl %eax, %ds
83
84 movq saved_context_esp(%rip), %rsp
85 movq saved_context_ebp(%rip), %rbp
86 /* Don't restore %rax, it must be 0 anyway */
87 movq saved_context_ebx(%rip), %rbx
88 movq saved_context_ecx(%rip), %rcx
89 movq saved_context_edx(%rip), %rdx
90 movq saved_context_esi(%rip), %rsi
91 movq saved_context_edi(%rip), %rdi
92 movq saved_context_r08(%rip), %r8
93 movq saved_context_r09(%rip), %r9
94 movq saved_context_r10(%rip), %r10
95 movq saved_context_r11(%rip), %r11
96 movq saved_context_r12(%rip), %r12
97 movq saved_context_r13(%rip), %r13
98 movq saved_context_r14(%rip), %r14
99 movq saved_context_r15(%rip), %r15
100 pushq saved_context_eflags(%rip) ; popfq
101
102 xorq %rax, %rax
103
104 ret
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
new file mode 100644
index 000000000000..477d8be57d64
--- /dev/null
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -0,0 +1,173 @@
1/*
2 * linux/arch/x86_64/kernel/sys_x86_64.c
3 */
4
5#include <linux/errno.h>
6#include <linux/sched.h>
7#include <linux/syscalls.h>
8#include <linux/mm.h>
9#include <linux/smp.h>
10#include <linux/smp_lock.h>
11#include <linux/sem.h>
12#include <linux/msg.h>
13#include <linux/shm.h>
14#include <linux/stat.h>
15#include <linux/mman.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/personality.h>
19
20#include <asm/uaccess.h>
21#include <asm/ia32.h>
22
23/*
24 * sys_pipe() is the normal C calling standard for creating
25 * a pipe. It's not the way Unix traditionally does this, though.
26 */
27asmlinkage long sys_pipe(int __user *fildes)
28{
29 int fd[2];
30 int error;
31
32 error = do_pipe(fd);
33 if (!error) {
34 if (copy_to_user(fildes, fd, 2*sizeof(int)))
35 error = -EFAULT;
36 }
37 return error;
38}
39
40asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
41 unsigned long fd, unsigned long off)
42{
43 long error;
44 struct file * file;
45
46 error = -EINVAL;
47 if (off & ~PAGE_MASK)
48 goto out;
49
50 error = -EBADF;
51 file = NULL;
52 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
53 if (!(flags & MAP_ANONYMOUS)) {
54 file = fget(fd);
55 if (!file)
56 goto out;
57 }
58 down_write(&current->mm->mmap_sem);
59 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
60 up_write(&current->mm->mmap_sem);
61
62 if (file)
63 fput(file);
64out:
65 return error;
66}
67
68static void find_start_end(unsigned long flags, unsigned long *begin,
69 unsigned long *end)
70{
71#ifdef CONFIG_IA32_EMULATION
72 if (test_thread_flag(TIF_IA32)) {
73 *begin = TASK_UNMAPPED_32;
74 *end = IA32_PAGE_OFFSET;
75 } else
76#endif
77 if (flags & MAP_32BIT) {
78		/* This is usually needed to map code in the small
79		   model, so it needs to be in the first 31 bits. Limit
80 it to that. This means we need to move the
81 unmapped base down for this case. This can give
82 conflicts with the heap, but we assume that glibc
83 malloc knows how to fall back to mmap. Give it 1GB
84 of playground for now. -AK */
85 *begin = 0x40000000;
86 *end = 0x80000000;
87 } else {
88 *begin = TASK_UNMAPPED_64;
89 *end = TASK_SIZE;
90 }
91}
92
93unsigned long
94arch_get_unmapped_area(struct file *filp, unsigned long addr,
95 unsigned long len, unsigned long pgoff, unsigned long flags)
96{
97 struct mm_struct *mm = current->mm;
98 struct vm_area_struct *vma;
99 unsigned long start_addr;
100 unsigned long begin, end;
101
102 find_start_end(flags, &begin, &end);
103
104 if (len > end)
105 return -ENOMEM;
106
107 if (addr) {
108 addr = PAGE_ALIGN(addr);
109 vma = find_vma(mm, addr);
110 if (end - len >= addr &&
111 (!vma || addr + len <= vma->vm_start))
112 return addr;
113 }
114 addr = mm->free_area_cache;
115 if (addr < begin)
116 addr = begin;
117 start_addr = addr;
118
119full_search:
120 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
121 /* At this point: (!vma || addr < vma->vm_end). */
122 if (end - len < addr) {
123 /*
124 * Start a new search - just in case we missed
125 * some holes.
126 */
127 if (start_addr != begin) {
128 start_addr = addr = begin;
129 goto full_search;
130 }
131 return -ENOMEM;
132 }
133 if (!vma || addr + len <= vma->vm_start) {
134 /*
135 * Remember the place where we stopped the search:
136 */
137 mm->free_area_cache = addr + len;
138 return addr;
139 }
140 addr = vma->vm_end;
141 }
142}
143
144asmlinkage long sys_uname(struct new_utsname __user * name)
145{
146 int err;
147 down_read(&uts_sem);
148 err = copy_to_user(name, &system_utsname, sizeof (*name));
149 up_read(&uts_sem);
150 if (personality(current->personality) == PER_LINUX32)
151 err |= copy_to_user(&name->machine, "i686", 5);
152 return err ? -EFAULT : 0;
153}
154
155asmlinkage long wrap_sys_shmat(int shmid, char __user *shmaddr, int shmflg)
156{
157 unsigned long raddr;
158 return do_shmat(shmid,shmaddr,shmflg,&raddr) ?: (long)raddr;
159}
160
161asmlinkage long sys_time64(long __user * tloc)
162{
163 struct timeval now;
164 int i;
165
166 do_gettimeofday(&now);
167 i = now.tv_sec;
168 if (tloc) {
169 if (put_user(i,tloc))
170 i = -EFAULT;
171 }
172 return i;
173}
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
new file mode 100644
index 000000000000..e263685f864c
--- /dev/null
+++ b/arch/x86_64/kernel/syscall.c
@@ -0,0 +1,26 @@
1/* System call table for x86-64. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <linux/config.h>
7
8#define __NO_STUBS
9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_
12#include <asm-x86_64/unistd.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [ nr ] = sym,
16#undef _ASM_X86_64_UNISTD_H_
17
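/*
 * How the table is built: unistd.h is included twice.  The first pass
 * (above) expands __SYSCALL(nr, sym) into extern declarations for every
 * syscall; the second pass (inside the initializer below) expands it into
 * designated initializers of the form [nr] = sym, so each slot gets its
 * handler and any gaps keep the sys_ni_syscall default.
 */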
18typedef void (*sys_call_ptr_t)(void);
19
20extern void sys_ni_syscall(void);
21
22sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = {
23	/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h>
26};
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
new file mode 100644
index 000000000000..3bafe438fa75
--- /dev/null
+++ b/arch/x86_64/kernel/time.c
@@ -0,0 +1,1262 @@
1/*
2 * linux/arch/x86-64/kernel/time.c
3 *
4 * "High Precision Event Timer" based timekeeping.
5 *
6 * Copyright (c) 1991,1992,1995 Linus Torvalds
7 * Copyright (c) 1994 Alan Modra
8 * Copyright (c) 1995 Markus Kuhn
9 * Copyright (c) 1996 Ingo Molnar
10 * Copyright (c) 1998 Andrea Arcangeli
11 * Copyright (c) 2002 Vojtech Pavlik
12 * Copyright (c) 2003 Andi Kleen
13 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
14 */
15
16#include <linux/kernel.h>
17#include <linux/sched.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/mc146818rtc.h>
21#include <linux/irq.h>
22#include <linux/time.h>
23#include <linux/ioport.h>
24#include <linux/module.h>
25#include <linux/device.h>
26#include <linux/sysdev.h>
27#include <linux/bcd.h>
28#include <linux/kallsyms.h>
29#include <asm/8253pit.h>
30#include <asm/pgtable.h>
31#include <asm/vsyscall.h>
32#include <asm/timex.h>
33#include <asm/proto.h>
34#include <asm/hpet.h>
35#include <asm/sections.h>
36#include <linux/cpufreq.h>
37#include <linux/hpet.h>
38#ifdef CONFIG_X86_LOCAL_APIC
39#include <asm/apic.h>
40#endif
41
42u64 jiffies_64 = INITIAL_JIFFIES;
43
44EXPORT_SYMBOL(jiffies_64);
45
46#ifdef CONFIG_CPU_FREQ
47static void cpufreq_delayed_get(void);
48#endif
49extern void i8254_timer_resume(void);
50extern int using_apic_timer;
51
52DEFINE_SPINLOCK(rtc_lock);
53DEFINE_SPINLOCK(i8253_lock);
54
55static int nohpet __initdata = 0;
56static int notsc __initdata = 0;
57
58#undef HPET_HACK_ENABLE_DANGEROUS
59
60unsigned int cpu_khz; /* TSC clocks / usec, not used here */
61static unsigned long hpet_period; /* fsecs / HPET clock */
62unsigned long hpet_tick; /* HPET clocks / interrupt */
63unsigned long vxtime_hz = PIT_TICK_RATE;
64int report_lost_ticks; /* command line option */
65unsigned long long monotonic_base;
66
67struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
68
69volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
70unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
71struct timespec __xtime __section_xtime;
72struct timezone __sys_tz __section_sys_tz;
73
74static inline void rdtscll_sync(unsigned long *tsc)
75{
76#ifdef CONFIG_SMP
77 sync_core();
78#endif
79 rdtscll(*tsc);
80}
81
82/*
83 * do_gettimeoffset() returns microseconds since last timer interrupt was
84 * triggered by hardware. A memory read of HPET is slower than a register read
85 * of TSC, but much more reliable. It's also synchronized to the timer
86 * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
87 * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
88 * This is not a problem, because jiffies hasn't updated either. They are bound
89 * together by xtime_lock.
90 */
91
92static inline unsigned int do_gettimeoffset_tsc(void)
93{
94 unsigned long t;
95 unsigned long x;
96 rdtscll_sync(&t);
97 if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
98 x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
99 return x;
100}
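
/*
 * vxtime.tsc_quot is maintained as (1000 << 32) / cpu_khz (see the cpufreq
 * notifier below), so the multiply-and-shift above works out to
 * delta_cycles * 1000 / cpu_khz, i.e. microseconds.  For example, on a
 * 2 GHz CPU (cpu_khz == 2000000) a delta of 2000 cycles yields
 * 2000 * 1000 / 2000000 = 1 usec.
 */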
101
102static inline unsigned int do_gettimeoffset_hpet(void)
103{
104 return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
105}
106
107unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
108
109/*
110 * This version of gettimeofday() has microsecond resolution and better than
111 * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
112 * MHz) HPET timer.
113 */
114
115void do_gettimeofday(struct timeval *tv)
116{
117 unsigned long seq, t;
118 unsigned int sec, usec;
119
120 do {
121 seq = read_seqbegin(&xtime_lock);
122
123 sec = xtime.tv_sec;
124 usec = xtime.tv_nsec / 1000;
125
126 /* i386 does some correction here to keep the clock
127	   monotonic even when ntpd is fixing drift.
128	   But that didn't work for me; there is a non-monotonic
129	   clock with ntp anyway.
130 I dropped all corrections now until a real solution can
131 be found. Note when you fix it here you need to do the same
132 in arch/x86_64/kernel/vsyscall.c and export all needed
133 variables in vmlinux.lds. -AK */
134
135 t = (jiffies - wall_jiffies) * (1000000L / HZ) +
136 do_gettimeoffset();
137 usec += t;
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 tv->tv_sec = sec + usec / 1000000;
142 tv->tv_usec = usec % 1000000;
143}
144
145EXPORT_SYMBOL(do_gettimeofday);
146
147/*
148 * settimeofday() first undoes the correction that gettimeofday would do
149 * on the time, and then saves it. This is ugly, but has been like this for
150 * ages already.
151 */
152
153int do_settimeofday(struct timespec *tv)
154{
155 time_t wtm_sec, sec = tv->tv_sec;
156 long wtm_nsec, nsec = tv->tv_nsec;
157
158 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
159 return -EINVAL;
160
161 write_seqlock_irq(&xtime_lock);
162
163 nsec -= do_gettimeoffset() * 1000 +
164 (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
165
166 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
167 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
168
169 set_normalized_timespec(&xtime, sec, nsec);
170 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
171
172 time_adjust = 0; /* stop active adjtime() */
173 time_status |= STA_UNSYNC;
174 time_maxerror = NTP_PHASE_LIMIT;
175 time_esterror = NTP_PHASE_LIMIT;
176
177 write_sequnlock_irq(&xtime_lock);
178 clock_was_set();
179 return 0;
180}
181
182EXPORT_SYMBOL(do_settimeofday);
183
184unsigned long profile_pc(struct pt_regs *regs)
185{
186 unsigned long pc = instruction_pointer(regs);
187
188 /* Assume the lock function has either no stack frame or only a single word.
189 This checks if the address on the stack looks like a kernel text address.
190 There is a small window for false hits, but in that case the tick
191 is just accounted to the spinlock function.
192 Better would be to write these functions in assembler again
193 and check exactly. */
194 if (in_lock_functions(pc)) {
195 char *v = *(char **)regs->rsp;
196 if ((v >= _stext && v <= _etext) ||
197 (v >= _sinittext && v <= _einittext) ||
198 (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
199 return (unsigned long)v;
200 return ((unsigned long *)regs->rsp)[1];
201 }
202 return pc;
203}
204EXPORT_SYMBOL(profile_pc);
205
206/*
207 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
208 * ms after the second nowtime has started, because when nowtime is written
209 * into the registers of the CMOS clock, it will jump to the next second
210 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
211 * sheet for details.
212 */
213
214static void set_rtc_mmss(unsigned long nowtime)
215{
216 int real_seconds, real_minutes, cmos_minutes;
217 unsigned char control, freq_select;
218
219/*
220 * IRQs are disabled when we're called from the timer interrupt,
221 * no need for spin_lock_irqsave()
222 */
223
224 spin_lock(&rtc_lock);
225
226/*
227 * Tell the clock it's being set and stop it.
228 */
229
230 control = CMOS_READ(RTC_CONTROL);
231 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
232
233 freq_select = CMOS_READ(RTC_FREQ_SELECT);
234 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
235
236 cmos_minutes = CMOS_READ(RTC_MINUTES);
237 BCD_TO_BIN(cmos_minutes);
238
239/*
240 * since we're only adjusting minutes and seconds, don't interfere with hour
241 * overflow. This avoids messing with unknown time zones but requires your RTC
242 * not to be off by more than 15 minutes. Since we're calling it only when
243 * our clock is externally synchronized using NTP, this shouldn't be a problem.
244 */
245
246 real_seconds = nowtime % 60;
247 real_minutes = nowtime / 60;
248 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
249 real_minutes += 30; /* correct for half hour time zone */
250 real_minutes %= 60;
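	/*
	 * Worked example: with a half-hour time zone the CMOS clock may be
	 * 30 minutes ahead of UTC.  If real_minutes is 10 and cmos_minutes
	 * is 40, the test above sees (|10 - 40| + 15) / 30 == 1 (odd), adds
	 * 30 and writes back 40, keeping the stored minutes consistent with
	 * what the CMOS clock already holds.
	 */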
251
252#if 0
253 /* AMD 8111 is a really bad time keeper and hits this regularly.
254 It probably was an attempt to avoid screwing up DST, but ignore
255 that for now. */
256 if (abs(real_minutes - cmos_minutes) >= 30) {
257 printk(KERN_WARNING "time.c: can't update CMOS clock "
258 "from %d to %d\n", cmos_minutes, real_minutes);
259 } else
260#endif
261
262 {
263 BIN_TO_BCD(real_seconds);
264 BIN_TO_BCD(real_minutes);
265 CMOS_WRITE(real_seconds, RTC_SECONDS);
266 CMOS_WRITE(real_minutes, RTC_MINUTES);
267 }
268
269/*
270 * The following flags have to be released exactly in this order, otherwise the
271 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
272 * not reset the oscillator and will not update precisely 500 ms later. You
273 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
274 * believes data sheets anyway ... -- Markus Kuhn
275 */
276
277 CMOS_WRITE(control, RTC_CONTROL);
278 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
279
280 spin_unlock(&rtc_lock);
281}
282
283
284/* monotonic_clock(): returns # of nanoseconds passed since time_init()
285 * Note: This function is required to return accurate
286 * time even in the absence of multiple timer ticks.
287 */
288unsigned long long monotonic_clock(void)
289{
290 unsigned long seq;
291 u32 last_offset, this_offset, offset;
292 unsigned long long base;
293
294 if (vxtime.mode == VXTIME_HPET) {
295 do {
296 seq = read_seqbegin(&xtime_lock);
297
298 last_offset = vxtime.last;
299 base = monotonic_base;
300 this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
301
302 } while (read_seqretry(&xtime_lock, seq));
303 offset = (this_offset - last_offset);
304 offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
305 return base + offset;
306 }else{
307 do {
308 seq = read_seqbegin(&xtime_lock);
309
310 last_offset = vxtime.last_tsc;
311 base = monotonic_base;
312 } while (read_seqretry(&xtime_lock, seq));
313 sync_core();
314 rdtscll(this_offset);
315 offset = (this_offset - last_offset)*1000/cpu_khz;
316 return base + offset;
317 }
318
319
320}
321EXPORT_SYMBOL(monotonic_clock);
322
323static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
324{
325 static long lost_count;
326 static int warned;
327
328 if (report_lost_ticks) {
329 printk(KERN_WARNING "time.c: Lost %d timer "
330 "tick(s)! ", lost);
331		print_symbol("rip %s\n", regs->rip);
332 }
333
334 if (lost_count == 1000 && !warned) {
335 printk(KERN_WARNING
336 "warning: many lost ticks.\n"
337		       KERN_WARNING "Your time source seems to be unstable or "
338				"some driver is hogging interrupts\n");
339 print_symbol("rip %s\n", regs->rip);
340 if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
341 printk(KERN_WARNING "Falling back to HPET\n");
342 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
343 vxtime.mode = VXTIME_HPET;
344 do_gettimeoffset = do_gettimeoffset_hpet;
345 }
346 /* else should fall back to PIT, but code missing. */
347 warned = 1;
348 } else
349 lost_count++;
350
351#ifdef CONFIG_CPU_FREQ
352 /* In some cases the CPU can change frequency without us noticing
353	   (like going into thermal throttle).
354	   Give cpufreq a chance to catch up. */
355 if ((lost_count+1) % 25 == 0) {
356 cpufreq_delayed_get();
357 }
358#endif
359}
360
361static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
362{
363 static unsigned long rtc_update = 0;
364 unsigned long tsc;
365 int delay, offset = 0, lost = 0;
366
367/*
368 * Here we are in the timer irq handler. We have irqs locally disabled (so we
369 * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
370 * on the other CPU, so we need a lock. We also need to lock the vsyscall
371 * variables, because both do_timer() and us change them -arca+vojtech
372 */
373
374 write_seqlock(&xtime_lock);
375
376 if (vxtime.hpet_address) {
377 offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
378 delay = hpet_readl(HPET_COUNTER) - offset;
379 } else {
380 spin_lock(&i8253_lock);
381 outb_p(0x00, 0x43);
382 delay = inb_p(0x40);
383 delay |= inb(0x40) << 8;
384 spin_unlock(&i8253_lock);
385 delay = LATCH - 1 - delay;
386 }
387
388 rdtscll_sync(&tsc);
389
390 if (vxtime.mode == VXTIME_HPET) {
391 if (offset - vxtime.last > hpet_tick) {
392 lost = (offset - vxtime.last) / hpet_tick - 1;
393 }
394
395 monotonic_base +=
396 (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
397
398 vxtime.last = offset;
399 } else {
400 offset = (((tsc - vxtime.last_tsc) *
401 vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
402
403 if (offset < 0)
404 offset = 0;
405
406 if (offset > (USEC_PER_SEC / HZ)) {
407 lost = offset / (USEC_PER_SEC / HZ);
408 offset %= (USEC_PER_SEC / HZ);
409 }
410
411 monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
412
413 vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
414
415 if ((((tsc - vxtime.last_tsc) *
416 vxtime.tsc_quot) >> 32) < offset)
417 vxtime.last_tsc = tsc -
418 (((long) offset << 32) / vxtime.tsc_quot) - 1;
419 }
420
421 if (lost > 0) {
422 handle_lost_ticks(lost, regs);
423 jiffies += lost;
424 }
425
426/*
427 * Do the timer stuff.
428 */
429
430 do_timer(regs);
431#ifndef CONFIG_SMP
432 update_process_times(user_mode(regs));
433#endif
434
435/*
436 * In the SMP case we use the local APIC timer interrupt to do the profiling,
437 * except when we simulate SMP mode on a uniprocessor system, in that case we
438 * have to call the local interrupt handler.
439 */
440
441#ifndef CONFIG_X86_LOCAL_APIC
442 profile_tick(CPU_PROFILING, regs);
443#else
444 if (!using_apic_timer)
445 smp_local_timer_interrupt(regs);
446#endif
447
448/*
449 * If we have an externally synchronized Linux clock, then update CMOS clock
450 * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
451 * closest to exactly 500 ms before the next second. If the update fails, we
452 * don't care, as it'll be updated on the next turn, and the problem (time way
453 * off) isn't likely to go away much sooner anyway.
454 */
455
456 if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
457 abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
458 set_rtc_mmss(xtime.tv_sec);
459 rtc_update = xtime.tv_sec + 660;
460 }
461
462 write_sequnlock(&xtime_lock);
463
464 return IRQ_HANDLED;
465}
466
467static unsigned int cyc2ns_scale;
468#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
469
470static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
471{
472 cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
473}
474
475static inline unsigned long long cycles_2_ns(unsigned long long cyc)
476{
477 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
478}
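
/*
 * Worked example of the fixed-point conversion: with cpu_mhz == 2000,
 * cyc2ns_scale = (1000 << 10) / 2000 = 512, so cycles_2_ns(2000) =
 * (2000 * 512) >> 10 = 1000 ns -- i.e. 2000 cycles at 2 GHz is 1 usec,
 * as expected.
 */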
479
480unsigned long long sched_clock(void)
481{
482 unsigned long a = 0;
483
484#if 0
485 /* Don't do a HPET read here. Using TSC always is much faster
486 and HPET may not be mapped yet when the scheduler first runs.
487 Disadvantage is a small drift between CPUs in some configurations,
488 but that should be tolerable. */
489 if (__vxtime.mode == VXTIME_HPET)
490 return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
491#endif
492
493 /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
494	   which means it is not completely exact and may not be monotonic between
495 CPUs. But the errors should be too small to matter for scheduling
496 purposes. */
497
498 rdtscll(a);
499 return cycles_2_ns(a);
500}
501
502unsigned long get_cmos_time(void)
503{
504 unsigned int timeout, year, mon, day, hour, min, sec;
505 unsigned char last, this;
506 unsigned long flags;
507
508/*
509 * The Linux interpretation of the CMOS clock register contents: When the
510 * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
511 * second which has precisely just started. Waiting for this can take up to 1
512 * second; we time out after approximately 2.4 seconds on a machine with
513 * a standard 8.3 MHz ISA bus.
514 */
515
516 spin_lock_irqsave(&rtc_lock, flags);
517
518 timeout = 1000000;
519 last = this = 0;
520
521 while (timeout && last && !this) {
522 last = this;
523 this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
524 timeout--;
525 }
526
527/*
528 * Here we are safe to assume the registers won't change for a whole second, so
529 * we just go ahead and read them.
530 */
531
532 sec = CMOS_READ(RTC_SECONDS);
533 min = CMOS_READ(RTC_MINUTES);
534 hour = CMOS_READ(RTC_HOURS);
535 day = CMOS_READ(RTC_DAY_OF_MONTH);
536 mon = CMOS_READ(RTC_MONTH);
537 year = CMOS_READ(RTC_YEAR);
538
539 spin_unlock_irqrestore(&rtc_lock, flags);
540
541/*
542 * We know that x86-64 always uses BCD format, no need to check the config
543 * register.
544 */
545
546 BCD_TO_BIN(sec);
547 BCD_TO_BIN(min);
548 BCD_TO_BIN(hour);
549 BCD_TO_BIN(day);
550 BCD_TO_BIN(mon);
551 BCD_TO_BIN(year);
552
553/*
554 * x86-64 systems have only existed since 2002.
555 * This will work up to Dec 31, 2100.
556 */
557 year += 2000;
558
559 return mktime(year, mon, day, hour, min, sec);
560}
561
562#ifdef CONFIG_CPU_FREQ
563
564/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
565 changes.
566
567 RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
568 not that important because current Opteron setups do not support
569   scaling on SMP anyway.
570
571 Should fix up last_tsc too. Currently gettimeofday in the
572 first tick after the change will be slightly wrong. */
573
574#include <linux/workqueue.h>
575
576static unsigned int cpufreq_delayed_issched = 0;
577static unsigned int cpufreq_init = 0;
578static struct work_struct cpufreq_delayed_get_work;
579
580static void handle_cpufreq_delayed_get(void *v)
581{
582 unsigned int cpu;
583 for_each_online_cpu(cpu) {
584 cpufreq_get(cpu);
585 }
586 cpufreq_delayed_issched = 0;
587}
588
589/* If we notice lost ticks, schedule a call to cpufreq_get(); it verifies
590 * that the CPU frequency the timing core thinks the CPU is running at
591 * is still correct.
592 */
593static void cpufreq_delayed_get(void)
594{
595 static int warned;
596 if (cpufreq_init && !cpufreq_delayed_issched) {
597 cpufreq_delayed_issched = 1;
598 if (!warned) {
599 warned = 1;
600 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
601 }
602 schedule_work(&cpufreq_delayed_get_work);
603 }
604}
605
606static unsigned int ref_freq = 0;
607static unsigned long loops_per_jiffy_ref = 0;
608
609static unsigned long cpu_khz_ref = 0;
610
611static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
612 void *data)
613{
614 struct cpufreq_freqs *freq = data;
615 unsigned long *lpj, dummy;
616
617 lpj = &dummy;
618 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
619#ifdef CONFIG_SMP
620 lpj = &cpu_data[freq->cpu].loops_per_jiffy;
621#else
622 lpj = &boot_cpu_data.loops_per_jiffy;
623#endif
624
625
626
627 if (!ref_freq) {
628 ref_freq = freq->old;
629 loops_per_jiffy_ref = *lpj;
630 cpu_khz_ref = cpu_khz;
631 }
632 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
633 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
634 (val == CPUFREQ_RESUMECHANGE)) {
635 *lpj =
636 cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
637
638 cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
639 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
640 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
641 }
642
643 set_cyc2ns_scale(cpu_khz_ref / 1000);
644
645 return 0;
646}
647
648static struct notifier_block time_cpufreq_notifier_block = {
649 .notifier_call = time_cpufreq_notifier
650};
651
652static int __init cpufreq_tsc(void)
653{
654 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
655 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
656 CPUFREQ_TRANSITION_NOTIFIER))
657 cpufreq_init = 1;
658 return 0;
659}
660
661core_initcall(cpufreq_tsc);
662
663#endif
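
The notifier above rescales loops_per_jiffy, cpu_khz and vxtime.tsc_quot proportionally when the frequency changes. A hedged sketch of the proportional rescale, assuming cpufreq_scale(old, ref, new) behaves like old * new / ref:

#include <stdio.h>

/* Assumed behaviour of cpufreq_scale(): rescale old by new_khz/ref_khz. */
static unsigned long scale(unsigned long old, unsigned int ref_khz,
			   unsigned int new_khz)
{
	return (unsigned long)((unsigned long long)old * new_khz / ref_khz);
}

int main(void)
{
	unsigned long cpu_khz_ref = 2000000;	/* reference value: 2.0 GHz */
	unsigned int ref_khz = 2000000, new_khz = 1000000;	/* throttled to 1.0 GHz */
	unsigned long cpu_khz = scale(cpu_khz_ref, ref_khz, new_khz);

	/* tsc_quot would then be recomputed as (1000 << 32) / cpu_khz */
	printf("cpu_khz: %lu -> %lu\n", cpu_khz_ref, cpu_khz);
	return 0;
}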
664
665/*
666 * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
667 * it to the HPET timer of known frequency.
668 */
669
670#define TICK_COUNT 100000000
671
672static unsigned int __init hpet_calibrate_tsc(void)
673{
674 int tsc_start, hpet_start;
675 int tsc_now, hpet_now;
676 unsigned long flags;
677
678 local_irq_save(flags);
679 local_irq_disable();
680
681 hpet_start = hpet_readl(HPET_COUNTER);
682 rdtscl(tsc_start);
683
684 do {
685 local_irq_disable();
686 hpet_now = hpet_readl(HPET_COUNTER);
687 sync_core();
688 rdtscl(tsc_now);
689 local_irq_restore(flags);
690 } while ((tsc_now - tsc_start) < TICK_COUNT &&
691 (hpet_now - hpet_start) < TICK_COUNT);
692
693 return (tsc_now - tsc_start) * 1000000000L
694 / ((hpet_now - hpet_start) * hpet_period / 1000);
695}
696
697
698/*
699 * pit_calibrate_tsc() uses the speaker output (channel 2) of
700 * the PIT. This is better than using the timer interrupt output,
701 * because we can read the value of the speaker with just one inb(),
702 * whereas we need three i/o operations for the interrupt channel.
703 * We count how many ticks the TSC does in 50 ms.
704 */
705
706static unsigned int __init pit_calibrate_tsc(void)
707{
708 unsigned long start, end;
709 unsigned long flags;
710
711 spin_lock_irqsave(&i8253_lock, flags);
712
713 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
714
715 outb(0xb0, 0x43);
716 outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
717 outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
718 rdtscll(start);
719 sync_core();
720 while ((inb(0x61) & 0x20) == 0);
721 sync_core();
722 rdtscll(end);
723
724 spin_unlock_irqrestore(&i8253_lock, flags);
725
726 return (end - start) / 50;
727}
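
The arithmetic at the end is worth spelling out: the PIT gate above is programmed for a 50 ms one-shot, so the TSC delta divided by 50 is cycles per millisecond, which is exactly the CPU frequency in kHz. A worked example as a sketch (the 100,000,000-cycle delta is hypothetical):

#include <stdio.h>

int main(void)
{
	/* Suppose the TSC advanced by 100,000,000 cycles during the 50 ms gate. */
	unsigned long long start = 0, end = 100000000ULL;
	unsigned long long cpu_khz = (end - start) / 50;

	/* 100e6 cycles / 50 ms = 2,000,000 cycles per ms, i.e. a 2.0 GHz CPU */
	printf("cpu_khz = %llu\n", cpu_khz);
	return 0;
}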
728
729#ifdef CONFIG_HPET
730static __init int late_hpet_init(void)
731{
732 struct hpet_data hd;
733 unsigned int ntimer;
734
735 if (!vxtime.hpet_address)
736 return -1;
737
738 memset(&hd, 0, sizeof (hd));
739
740 ntimer = hpet_readl(HPET_ID);
741 ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
742 ntimer++;
743
744 /*
745 * Register with driver.
746 * Timer0 and Timer1 are used by the platform.
747 */
748 hd.hd_phys_address = vxtime.hpet_address;
749 hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE);
750 hd.hd_nirqs = ntimer;
751 hd.hd_flags = HPET_DATA_PLATFORM;
752 hpet_reserve_timer(&hd, 0);
753#ifdef CONFIG_HPET_EMULATE_RTC
754 hpet_reserve_timer(&hd, 1);
755#endif
756 hd.hd_irq[0] = HPET_LEGACY_8254;
757 hd.hd_irq[1] = HPET_LEGACY_RTC;
758 if (ntimer > 2) {
759 struct hpet *hpet;
760 struct hpet_timer *timer;
761 int i;
762
763 hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
764
765 for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
766 timer++, i++)
767 hd.hd_irq[i] = (timer->hpet_config &
768 Tn_INT_ROUTE_CNF_MASK) >>
769 Tn_INT_ROUTE_CNF_SHIFT;
770
771 }
772
773 hpet_alloc(&hd);
774 return 0;
775}
776fs_initcall(late_hpet_init);
777#endif
778
779static int hpet_timer_stop_set_go(unsigned long tick)
780{
781 unsigned int cfg;
782
783/*
784 * Stop the timers and reset the main counter.
785 */
786
787 cfg = hpet_readl(HPET_CFG);
788 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
789 hpet_writel(cfg, HPET_CFG);
790 hpet_writel(0, HPET_COUNTER);
791 hpet_writel(0, HPET_COUNTER + 4);
792
793/*
794 * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
795 * and period also hpet_tick.
796 */
797
798 hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
799 HPET_TN_32BIT, HPET_T0_CFG);
800 hpet_writel(hpet_tick, HPET_T0_CMP);
801 hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
802
803/*
804 * Go!
805 */
806
807 cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
808 hpet_writel(cfg, HPET_CFG);
809
810 return 0;
811}
812
813static int hpet_init(void)
814{
815 unsigned int id;
816
817 if (!vxtime.hpet_address)
818 return -1;
819 set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
820 __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
821
822/*
823 * Read the period, compute tick and quotient.
824 */
825
826 id = hpet_readl(HPET_ID);
827
828 if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
829 !(id & HPET_ID_LEGSUP))
830 return -1;
831
832 hpet_period = hpet_readl(HPET_PERIOD);
833 if (hpet_period < 100000 || hpet_period > 100000000)
834 return -1;
835
836 hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
837 hpet_period;
838
839 return hpet_timer_stop_set_go(hpet_tick);
840}
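
HPET_PERIOD reports the counter period in femtoseconds, so hpet_tick above is simply femtoseconds-per-timer-interrupt divided by femtoseconds-per-counter-step, rounded to nearest by adding half the divisor. A sketch of the same arithmetic, using the common 14.31818 MHz HPET period and HZ=1000 purely as example values:

#include <stdio.h>

int main(void)
{
	unsigned long long hpet_period = 69841279;	/* fs per counter step (~14.318 MHz) */
	unsigned long long usec_per_tick = 1000;	/* USEC_PER_SEC / HZ with HZ = 1000 */

	/* 1e9 femtoseconds per microsecond; adding period/2 rounds to nearest */
	unsigned long long hpet_tick =
		(1000000000ULL * usec_per_tick + hpet_period / 2) / hpet_period;

	printf("hpet_tick = %llu counter steps per timer interrupt\n", hpet_tick);	/* ~14318 */
	return 0;
}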
841
842static int hpet_reenable(void)
843{
844 return hpet_timer_stop_set_go(hpet_tick);
845}
846
847void __init pit_init(void)
848{
849 unsigned long flags;
850
851 spin_lock_irqsave(&i8253_lock, flags);
852 outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */
853 outb_p(LATCH & 0xff, 0x40); /* LSB */
854 outb_p(LATCH >> 8, 0x40); /* MSB */
855 spin_unlock_irqrestore(&i8253_lock, flags);
856}
857
858int __init time_setup(char *str)
859{
860 report_lost_ticks = 1;
861 return 1;
862}
863
864static struct irqaction irq0 = {
865 timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
866};
867
868extern void __init config_acpi_tables(void);
869
870void __init time_init(void)
871{
872 char *timename;
873
874#ifdef HPET_HACK_ENABLE_DANGEROUS
875 if (!vxtime.hpet_address) {
876 printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
877 "manually!\n");
878 outl(0x800038a0, 0xcf8);
879 outl(0xff000001, 0xcfc);
880 outl(0x800038a0, 0xcf8);
881 vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
882 printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
883 "at %#lx.\n", vxtime.hpet_address);
884 }
885#endif
886 if (nohpet)
887 vxtime.hpet_address = 0;
888
889 xtime.tv_sec = get_cmos_time();
890 xtime.tv_nsec = 0;
891
892 set_normalized_timespec(&wall_to_monotonic,
893 -xtime.tv_sec, -xtime.tv_nsec);
894
895 if (!hpet_init()) {
896 vxtime_hz = (1000000000000000L + hpet_period / 2) /
897 hpet_period;
898 cpu_khz = hpet_calibrate_tsc();
899 timename = "HPET";
900 } else {
901 pit_init();
902 cpu_khz = pit_calibrate_tsc();
903 timename = "PIT";
904 }
905
906 printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
907 vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
908 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
909 cpu_khz / 1000, cpu_khz % 1000);
910 vxtime.mode = VXTIME_TSC;
911 vxtime.quot = (1000000L << 32) / vxtime_hz;
912 vxtime.tsc_quot = (1000L << 32) / cpu_khz;
913 vxtime.hz = vxtime_hz;
914 rdtscll_sync(&vxtime.last_tsc);
915 setup_irq(0, &irq0);
916
917 set_cyc2ns_scale(cpu_khz / 1000);
918}
919
920void __init time_init_smp(void)
921{
922 char *timetype;
923
924 /*
925 * AMD systems with more than one CPU don't have fully synchronized
926 * TSCs. Always use HPET gettimeofday for these, although it is slower.
927 * Intel SMP systems usually have synchronized TSCs, so always use
928 * the TSC.
929 *
930 * Exceptions:
931 * IBM Summit2 checked by oem_force_hpet_timer().
932 * AMD dual core may also not need HPET. Check me.
933 *
934 * Can be turned off with "notsc".
935 */
936 if (num_online_cpus() > 1 &&
937 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
938 notsc = 1;
939 /* Some systems will want to disable TSC and use HPET. */
940 if (oem_force_hpet_timer())
941 notsc = 1;
942 if (vxtime.hpet_address && notsc) {
943 timetype = "HPET";
944 vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
945 vxtime.mode = VXTIME_HPET;
946 do_gettimeoffset = do_gettimeoffset_hpet;
947 } else {
948 timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
949 vxtime.mode = VXTIME_TSC;
950 }
951
952 printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
953}
954
955__setup("report_lost_ticks", time_setup);
956
957static long clock_cmos_diff;
958static unsigned long sleep_start;
959
960static int timer_suspend(struct sys_device *dev, u32 state)
961{
962 /*
963 * Estimate time zone so that set_time can update the clock
964 */
965 long cmos_time = get_cmos_time();
966
967 clock_cmos_diff = -cmos_time;
968 clock_cmos_diff += get_seconds();
969 sleep_start = cmos_time;
970 return 0;
971}
972
973static int timer_resume(struct sys_device *dev)
974{
975 unsigned long flags;
976 unsigned long sec;
977 unsigned long ctime = get_cmos_time();
978 unsigned long sleep_length = (ctime - sleep_start) * HZ;
979
980 if (vxtime.hpet_address)
981 hpet_reenable();
982 else
983 i8254_timer_resume();
984
985 sec = ctime + clock_cmos_diff;
986 write_seqlock_irqsave(&xtime_lock,flags);
987 xtime.tv_sec = sec;
988 xtime.tv_nsec = 0;
989 write_sequnlock_irqrestore(&xtime_lock,flags);
990 jiffies += sleep_length;
991 wall_jiffies += sleep_length;
992 return 0;
993}
994
995static struct sysdev_class timer_sysclass = {
996 .resume = timer_resume,
997 .suspend = timer_suspend,
998 set_kset_name("timer"),
999};
1000
1001
1002/* XXX this driverfs stuff should probably go elsewhere later -john */
1003static struct sys_device device_timer = {
1004 .id = 0,
1005 .cls = &timer_sysclass,
1006};
1007
1008static int time_init_device(void)
1009{
1010 int error = sysdev_class_register(&timer_sysclass);
1011 if (!error)
1012 error = sysdev_register(&device_timer);
1013 return error;
1014}
1015
1016device_initcall(time_init_device);
1017
1018#ifdef CONFIG_HPET_EMULATE_RTC
1019/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
1020 * is enabled, we support RTC interrupt functionality in software.
1021 * RTC has 3 kinds of interrupts:
1022 * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
1023 * is updated
1024 * 2) Alarm Interrupt - generate an interrupt at a specific time of day
1025 * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
1026 * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
1027 * (1) and (2) above are implemented using polling at a frequency of
1028 * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
1029 * overhead. (DEFAULT_RTC_INT_FREQ)
1030 * For (3), we use interrupts at 64Hz or user specified periodic
1031 * frequency, whichever is higher.
1032 */
1033#include <linux/rtc.h>
1034
1035extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
1036
1037#define DEFAULT_RTC_INT_FREQ 64
1038#define RTC_NUM_INTS 1
1039
1040static unsigned long UIE_on;
1041static unsigned long prev_update_sec;
1042
1043static unsigned long AIE_on;
1044static struct rtc_time alarm_time;
1045
1046static unsigned long PIE_on;
1047static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
1048static unsigned long PIE_count;
1049
1050static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
1051
1052int is_hpet_enabled(void)
1053{
1054 return vxtime.hpet_address != 0;
1055}
1056
1057/*
1058 * Timer 1 for RTC, we do not use periodic interrupt feature,
1059 * even if HPET supports periodic interrupts on Timer 1.
1060 * The reason being, to set up a periodic interrupt in HPET, we need to
1061 * stop the main counter. And if we do that every time someone disables/enables
1062 * the RTC, it will have an adverse effect on the main kernel timer running on Timer 0.
1063 * So, for the time being, simulate the periodic interrupt in software.
1064 *
1065 * hpet_rtc_timer_init() is called the first time; on subsequent
1066 * interrupts, reinit happens through hpet_rtc_timer_reinit().
1067 */
1068int hpet_rtc_timer_init(void)
1069{
1070 unsigned int cfg, cnt;
1071 unsigned long flags;
1072
1073 if (!is_hpet_enabled())
1074 return 0;
1075 /*
1076 * Set the counter 1 and enable the interrupts.
1077 */
1078 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1079 hpet_rtc_int_freq = PIE_freq;
1080 else
1081 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1082
1083 local_irq_save(flags);
1084 cnt = hpet_readl(HPET_COUNTER);
1085 cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
1086 hpet_writel(cnt, HPET_T1_CMP);
1087 local_irq_restore(flags);
1088
1089 cfg = hpet_readl(HPET_T1_CFG);
1090 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1091 hpet_writel(cfg, HPET_T1_CFG);
1092
1093 return 1;
1094}
1095
1096static void hpet_rtc_timer_reinit(void)
1097{
1098 unsigned int cfg, cnt;
1099
1100 if (!(PIE_on | AIE_on | UIE_on))
1101 return;
1102
1103 if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
1104 hpet_rtc_int_freq = PIE_freq;
1105 else
1106 hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
1107
1108 /* It is more accurate to use the comparator value than current count.*/
1109 cnt = hpet_readl(HPET_T1_CMP);
1110 cnt += hpet_tick*HZ/hpet_rtc_int_freq;
1111 hpet_writel(cnt, HPET_T1_CMP);
1112
1113 cfg = hpet_readl(HPET_T1_CFG);
1114 cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
1115 hpet_writel(cfg, HPET_T1_CFG);
1116
1117 return;
1118}
1119
1120/*
1121 * The functions below are called from rtc driver.
1122 * Return 0 if HPET is not being used.
1123 * Otherwise do the necessary changes and return 1.
1124 */
1125int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1126{
1127 if (!is_hpet_enabled())
1128 return 0;
1129
1130 if (bit_mask & RTC_UIE)
1131 UIE_on = 0;
1132 if (bit_mask & RTC_PIE)
1133 PIE_on = 0;
1134 if (bit_mask & RTC_AIE)
1135 AIE_on = 0;
1136
1137 return 1;
1138}
1139
1140int hpet_set_rtc_irq_bit(unsigned long bit_mask)
1141{
1142 int timer_init_reqd = 0;
1143
1144 if (!is_hpet_enabled())
1145 return 0;
1146
1147 if (!(PIE_on | AIE_on | UIE_on))
1148 timer_init_reqd = 1;
1149
1150 if (bit_mask & RTC_UIE) {
1151 UIE_on = 1;
1152 }
1153 if (bit_mask & RTC_PIE) {
1154 PIE_on = 1;
1155 PIE_count = 0;
1156 }
1157 if (bit_mask & RTC_AIE) {
1158 AIE_on = 1;
1159 }
1160
1161 if (timer_init_reqd)
1162 hpet_rtc_timer_init();
1163
1164 return 1;
1165}
1166
1167int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
1168{
1169 if (!is_hpet_enabled())
1170 return 0;
1171
1172 alarm_time.tm_hour = hrs;
1173 alarm_time.tm_min = min;
1174 alarm_time.tm_sec = sec;
1175
1176 return 1;
1177}
1178
1179int hpet_set_periodic_freq(unsigned long freq)
1180{
1181 if (!is_hpet_enabled())
1182 return 0;
1183
1184 PIE_freq = freq;
1185 PIE_count = 0;
1186
1187 return 1;
1188}
1189
1190int hpet_rtc_dropped_irq(void)
1191{
1192 if (!is_hpet_enabled())
1193 return 0;
1194
1195 return 1;
1196}
1197
1198irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
1199{
1200 struct rtc_time curr_time;
1201 unsigned long rtc_int_flag = 0;
1202 int call_rtc_interrupt = 0;
1203
1204 hpet_rtc_timer_reinit();
1205
1206 if (UIE_on | AIE_on) {
1207 rtc_get_rtc_time(&curr_time);
1208 }
1209 if (UIE_on) {
1210 if (curr_time.tm_sec != prev_update_sec) {
1211 /* Set update int info, call real rtc int routine */
1212 call_rtc_interrupt = 1;
1213 rtc_int_flag = RTC_UF;
1214 prev_update_sec = curr_time.tm_sec;
1215 }
1216 }
1217 if (PIE_on) {
1218 PIE_count++;
1219 if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
1220 /* Set periodic int info, call real rtc int routine */
1221 call_rtc_interrupt = 1;
1222 rtc_int_flag |= RTC_PF;
1223 PIE_count = 0;
1224 }
1225 }
1226 if (AIE_on) {
1227 if ((curr_time.tm_sec == alarm_time.tm_sec) &&
1228 (curr_time.tm_min == alarm_time.tm_min) &&
1229 (curr_time.tm_hour == alarm_time.tm_hour)) {
1230 /* Set alarm int info, call real rtc int routine */
1231 call_rtc_interrupt = 1;
1232 rtc_int_flag |= RTC_AF;
1233 }
1234 }
1235 if (call_rtc_interrupt) {
1236 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
1237 rtc_interrupt(rtc_int_flag, dev_id, regs);
1238 }
1239 return IRQ_HANDLED;
1240}
1241#endif
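
The periodic-interrupt emulation above is a straightforward frequency divider: HPET timer 1 fires at hpet_rtc_int_freq (64 Hz by default, or the requested rate if that is higher), and a periodic RTC event is delivered every hpet_rtc_int_freq / PIE_freq handler invocations. A user-space sketch of that divider, with 64 Hz and 2 Hz as example rates:

#include <stdio.h>

int main(void)
{
	unsigned long int_freq = 64;	/* handler rate: DEFAULT_RTC_INT_FREQ */
	unsigned long pie_freq = 2;	/* user requested 2 Hz periodic interrupts */
	unsigned long pie_count = 0, events = 0, tick;

	/* simulate one second's worth of timer-1 interrupts */
	for (tick = 0; tick < int_freq; tick++) {
		pie_count++;
		if (pie_count >= int_freq / pie_freq) {
			events++;	/* would call rtc_interrupt() with RTC_PF set */
			pie_count = 0;
		}
	}
	printf("%lu periodic events delivered in one second\n", events);	/* 2 */
	return 0;
}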
1242
1243
1244
1245static int __init nohpet_setup(char *s)
1246{
1247 nohpet = 1;
1248 return 0;
1249}
1250
1251__setup("nohpet", nohpet_setup);
1252
1253
1254static int __init notsc_setup(char *s)
1255{
1256 notsc = 1;
1257 return 0;
1258}
1259
1260__setup("notsc", notsc_setup);
1261
1262
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
new file mode 100644
index 000000000000..6d9c9a8e7d0e
--- /dev/null
+++ b/arch/x86_64/kernel/trampoline.S
@@ -0,0 +1,64 @@
1/*
2 *
3 * Trampoline.S Derived from Setup.S by Linus Torvalds
4 *
5 * 4 Jan 1997 Michael Chastain: changed to gnu as.
6 *
7 * Entry: CS:IP point to the start of our code, we are
8 * in real mode with no stack, but the rest of the
9 * trampoline page to make our stack and everything else
10 * is a mystery.
11 *
12 * In fact we don't actually need a stack so we don't
13 * set one up.
14 *
15 * On entry to trampoline_data, the processor is in real mode
16 * with 16-bit addressing and 16-bit data. CS has some value
17 * and IP is zero. Thus, data addresses need to be absolute
18 * (no relocation) and are taken with regard to r_base.
19 *
20 * If you work on this file, check the object module with objdump
21 * --full-contents --reloc to make sure there are no relocation
22 * entries. For the GDT entry we do hand relocation in smpboot.c
23 * because of 64bit linker limitations.
24 */
25
26#include <linux/linkage.h>
27#include <asm/segment.h>
28#include <asm/page.h>
29
30.data
31
32.code16
33
34ENTRY(trampoline_data)
35r_base = .
36 wbinvd
37 mov %cs, %ax # Code and data in the same place
38 mov %ax, %ds
39
40 cli # We should be safe anyway
41
42 movl $0xA5A5A5A5, trampoline_data - r_base
43 # write marker so the master knows we're running
44
45 lidt idt_48 - r_base # load idt with 0, 0
46 lgdt gdt_48 - r_base # load gdt with whatever is appropriate
47
48 xor %ax, %ax
49 inc %ax # protected mode (PE) bit
50 lmsw %ax # into protected mode
51 # flush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S
52 ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map)
53
54 # Careful: these need to be in the same 64K segment as the above.
55idt_48:
56 .word 0 # idt limit = 0
57 .word 0, 0 # idt base = 0L
58
59gdt_48:
60 .short __KERNEL32_CS + 7 # gdt limit
61 .long cpu_gdt_table-__START_KERNEL_map
62
63.globl trampoline_end
64trampoline_end:
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
new file mode 100644
index 000000000000..d87ac6496cc2
--- /dev/null
+++ b/arch/x86_64/kernel/traps.c
@@ -0,0 +1,948 @@
1/*
2 * linux/arch/x86-64/traps.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
6 *
7 * Pentium III FXSR, SSE support
8 * Gareth Hughes <gareth@valinux.com>, May 2000
9 *
10 * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
11 */
12
13/*
14 * 'Traps.c' handles hardware traps and faults after we have saved some
15 * state in 'entry.S'.
16 */
17#include <linux/config.h>
18#include <linux/sched.h>
19#include <linux/kernel.h>
20#include <linux/string.h>
21#include <linux/errno.h>
22#include <linux/ptrace.h>
23#include <linux/timer.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/spinlock.h>
28#include <linux/interrupt.h>
29#include <linux/module.h>
30#include <linux/moduleparam.h>
31
32#include <asm/system.h>
33#include <asm/uaccess.h>
34#include <asm/io.h>
35#include <asm/atomic.h>
36#include <asm/debugreg.h>
37#include <asm/desc.h>
38#include <asm/i387.h>
39#include <asm/kdebug.h>
40#include <asm/processor.h>
41
42#include <asm/smp.h>
43#include <asm/pgalloc.h>
44#include <asm/pda.h>
45#include <asm/proto.h>
46#include <asm/nmi.h>
47
48#include <linux/irq.h>
49
50
51extern struct gate_struct idt_table[256];
52
53asmlinkage void divide_error(void);
54asmlinkage void debug(void);
55asmlinkage void nmi(void);
56asmlinkage void int3(void);
57asmlinkage void overflow(void);
58asmlinkage void bounds(void);
59asmlinkage void invalid_op(void);
60asmlinkage void device_not_available(void);
61asmlinkage void double_fault(void);
62asmlinkage void coprocessor_segment_overrun(void);
63asmlinkage void invalid_TSS(void);
64asmlinkage void segment_not_present(void);
65asmlinkage void stack_segment(void);
66asmlinkage void general_protection(void);
67asmlinkage void page_fault(void);
68asmlinkage void coprocessor_error(void);
69asmlinkage void simd_coprocessor_error(void);
70asmlinkage void reserved(void);
71asmlinkage void alignment_check(void);
72asmlinkage void machine_check(void);
73asmlinkage void spurious_interrupt_bug(void);
74asmlinkage void call_debug(void);
75
76struct notifier_block *die_chain;
77static DEFINE_SPINLOCK(die_notifier_lock);
78
79int register_die_notifier(struct notifier_block *nb)
80{
81 int err = 0;
82 unsigned long flags;
83 spin_lock_irqsave(&die_notifier_lock, flags);
84 err = notifier_chain_register(&die_chain, nb);
85 spin_unlock_irqrestore(&die_notifier_lock, flags);
86 return err;
87}
88
89static inline void conditional_sti(struct pt_regs *regs)
90{
91 if (regs->eflags & X86_EFLAGS_IF)
92 local_irq_enable();
93}
94
95static int kstack_depth_to_print = 10;
96
97#ifdef CONFIG_KALLSYMS
98#include <linux/kallsyms.h>
99int printk_address(unsigned long address)
100{
101 unsigned long offset = 0, symsize;
102 const char *symname;
103 char *modname;
104 char *delim = ":";
105 char namebuf[128];
106
107 symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
108 if (!symname)
109 return printk("[<%016lx>]", address);
110 if (!modname)
111 modname = delim = "";
112 return printk("<%016lx>{%s%s%s%s%+ld}",
113 address,delim,modname,delim,symname,offset);
114}
115#else
116int printk_address(unsigned long address)
117{
118 return printk("[<%016lx>]", address);
119}
120#endif
121
122unsigned long *in_exception_stack(int cpu, unsigned long stack)
123{
124 int k;
125 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
126 struct tss_struct *tss = &per_cpu(init_tss, cpu);
127 unsigned long start = tss->ist[k] - EXCEPTION_STKSZ;
128
129 if (stack >= start && stack < tss->ist[k])
130 return (unsigned long *)tss->ist[k];
131 }
132 return NULL;
133}
134
135/*
136 * x86-64 can have up to three kernel stacks:
137 * process stack
138 * interrupt stack
139 * severe exception (double fault, nmi, stack fault) hardware stack
140 * Check and process them in order.
141 */
142
143void show_trace(unsigned long *stack)
144{
145 unsigned long addr;
146 unsigned long *irqstack, *irqstack_end, *estack_end;
147 const int cpu = safe_smp_processor_id();
148 int i;
149
150 printk("\nCall Trace:");
151 i = 0;
152
153 estack_end = in_exception_stack(cpu, (unsigned long)stack);
154 if (estack_end) {
155 while (stack < estack_end) {
156 addr = *stack++;
157 if (__kernel_text_address(addr)) {
158 i += printk_address(addr);
159 i += printk(" ");
160 if (i > 50) {
161 printk("\n");
162 i = 0;
163 }
164 }
165 }
166 i += printk(" <EOE> ");
167 i += 7;
168 stack = (unsigned long *) estack_end[-2];
169 }
170
171 irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
172 irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64);
173
174 if (stack >= irqstack && stack < irqstack_end) {
175 printk("<IRQ> ");
176 while (stack < irqstack_end) {
177 addr = *stack++;
178 /*
179 * If the address is either in the text segment of the
180 * kernel, or in the region which contains vmalloc'ed
181 * memory, it *may* be the address of a calling
182 * routine; if so, print it so that someone tracing
183 * down the cause of the crash will be able to figure
184 * out the call path that was taken.
185 */
186 if (__kernel_text_address(addr)) {
187 i += printk_address(addr);
188 i += printk(" ");
189 if (i > 50) {
190 printk("\n ");
191 i = 0;
192 }
193 }
194 }
195 stack = (unsigned long *) (irqstack_end[-1]);
196 printk(" <EOI> ");
197 i += 7;
198 }
199
200 while (((long) stack & (THREAD_SIZE-1)) != 0) {
201 addr = *stack++;
202 if (__kernel_text_address(addr)) {
203 i += printk_address(addr);
204 i += printk(" ");
205 if (i > 50) {
206 printk("\n ");
207 i = 0;
208 }
209 }
210 }
211 printk("\n");
212}
213
214void show_stack(struct task_struct *tsk, unsigned long * rsp)
215{
216 unsigned long *stack;
217 int i;
218 const int cpu = safe_smp_processor_id();
219 unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
220 unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);
221
222 // debugging aid: "show_stack(NULL, NULL);" prints the
223 // back trace for this cpu.
224
225 if (rsp == NULL) {
226 if (tsk)
227 rsp = (unsigned long *)tsk->thread.rsp;
228 else
229 rsp = (unsigned long *)&rsp;
230 }
231
232 stack = rsp;
233 for(i=0; i < kstack_depth_to_print; i++) {
234 if (stack >= irqstack && stack <= irqstack_end) {
235 if (stack == irqstack_end) {
236 stack = (unsigned long *) (irqstack_end[-1]);
237 printk(" <EOI> ");
238 }
239 } else {
240 if (((long) stack & (THREAD_SIZE-1)) == 0)
241 break;
242 }
243 if (i && ((i % 4) == 0))
244 printk("\n ");
245 printk("%016lx ", *stack++);
246 }
247 show_trace((unsigned long *)rsp);
248}
249
250/*
251 * The architecture-independent dump_stack generator
252 */
253void dump_stack(void)
254{
255 unsigned long dummy;
256 show_trace(&dummy);
257}
258
259EXPORT_SYMBOL(dump_stack);
260
261void show_registers(struct pt_regs *regs)
262{
263 int i;
264 int in_kernel = (regs->cs & 3) == 0;
265 unsigned long rsp;
266 const int cpu = safe_smp_processor_id();
267 struct task_struct *cur = cpu_pda[cpu].pcurrent;
268
269 rsp = regs->rsp;
270
271 printk("CPU %d ", cpu);
272 __show_regs(regs);
273 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
274 cur->comm, cur->pid, cur->thread_info, cur);
275
276 /*
277 * When in-kernel, we also print out the stack and code at the
278 * time of the fault..
279 */
280 if (in_kernel) {
281
282 printk("Stack: ");
283 show_stack(NULL, (unsigned long*)rsp);
284
285 printk("\nCode: ");
286 if(regs->rip < PAGE_OFFSET)
287 goto bad;
288
289 for(i=0;i<20;i++)
290 {
291 unsigned char c;
292 if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
293bad:
294 printk(" Bad RIP value.");
295 break;
296 }
297 printk("%02x ", c);
298 }
299 }
300 printk("\n");
301}
302
303void handle_BUG(struct pt_regs *regs)
304{
305 struct bug_frame f;
306 char tmp;
307
308 if (regs->cs & 3)
309 return;
310 if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
311 sizeof(struct bug_frame)))
312 return;
313 if ((unsigned long)f.filename < __PAGE_OFFSET ||
314 f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
315 return;
316 if (__get_user(tmp, f.filename))
317 f.filename = "unmapped filename";
318 printk("----------- [cut here ] --------- [please bite here ] ---------\n");
319 printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line);
320}
321
322void out_of_line_bug(void)
323{
324 BUG();
325}
326
327static DEFINE_SPINLOCK(die_lock);
328static int die_owner = -1;
329
330void oops_begin(void)
331{
332 int cpu = safe_smp_processor_id();
333 /* racy, but better than risking deadlock. */
334 local_irq_disable();
335 if (!spin_trylock(&die_lock)) {
336 if (cpu == die_owner)
337 /* nested oops. should stop eventually */;
338 else
339 spin_lock(&die_lock);
340 }
341 die_owner = cpu;
342 console_verbose();
343 bust_spinlocks(1);
344}
345
346void oops_end(void)
347{
348 die_owner = -1;
349 bust_spinlocks(0);
350 spin_unlock(&die_lock);
351 if (panic_on_oops)
352 panic("Oops");
353}
354
355void __die(const char * str, struct pt_regs * regs, long err)
356{
357 static int die_counter;
358 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
359#ifdef CONFIG_PREEMPT
360 printk("PREEMPT ");
361#endif
362#ifdef CONFIG_SMP
363 printk("SMP ");
364#endif
365#ifdef CONFIG_DEBUG_PAGEALLOC
366 printk("DEBUG_PAGEALLOC");
367#endif
368 printk("\n");
369 notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
370 show_registers(regs);
371 /* Executive summary in case the oops scrolled away */
372 printk(KERN_ALERT "RIP ");
373 printk_address(regs->rip);
374 printk(" RSP <%016lx>\n", regs->rsp);
375}
376
377void die(const char * str, struct pt_regs * regs, long err)
378{
379 oops_begin();
380 handle_BUG(regs);
381 __die(str, regs, err);
382 oops_end();
383 do_exit(SIGSEGV);
384}
385static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
386{
387 if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS))
388 die(str, regs, err);
389}
390
391void die_nmi(char *str, struct pt_regs *regs)
392{
393 oops_begin();
394 /*
395 * We are in trouble anyway, let's at least try
396 * to get a message out.
397 */
398 printk(str, safe_smp_processor_id());
399 show_registers(regs);
400 if (panic_on_timeout || panic_on_oops)
401 panic("nmi watchdog");
402 printk("console shuts up ...\n");
403 oops_end();
404 do_exit(SIGSEGV);
405}
406
407static void do_trap(int trapnr, int signr, char *str,
408 struct pt_regs * regs, long error_code, siginfo_t *info)
409{
410 conditional_sti(regs);
411
412#ifdef CONFIG_CHECKING
413 {
414 unsigned long gs;
415 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
416 rdmsrl(MSR_GS_BASE, gs);
417 if (gs != (unsigned long)pda) {
418 wrmsrl(MSR_GS_BASE, pda);
419 printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
420 regs->rip);
421 }
422 }
423#endif
424
425 if ((regs->cs & 3) != 0) {
426 struct task_struct *tsk = current;
427
428 if (exception_trace && unhandled_signal(tsk, signr))
429 printk(KERN_INFO
430 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
431 tsk->comm, tsk->pid, str,
432 regs->rip,regs->rsp,error_code);
433
434 tsk->thread.error_code = error_code;
435 tsk->thread.trap_no = trapnr;
436 if (info)
437 force_sig_info(signr, info, tsk);
438 else
439 force_sig(signr, tsk);
440 return;
441 }
442
443
444 /* kernel trap */
445 {
446 const struct exception_table_entry *fixup;
447 fixup = search_exception_tables(regs->rip);
448 if (fixup) {
449 regs->rip = fixup->fixup;
450 } else
451 die(str, regs, error_code);
452 return;
453 }
454}
455
456#define DO_ERROR(trapnr, signr, str, name) \
457asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
458{ \
459 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
460 == NOTIFY_STOP) \
461 return; \
462 do_trap(trapnr, signr, str, regs, error_code, NULL); \
463}
464
465#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
466asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
467{ \
468 siginfo_t info; \
469 info.si_signo = signr; \
470 info.si_errno = 0; \
471 info.si_code = sicode; \
472 info.si_addr = (void __user *)siaddr; \
473 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
474 == NOTIFY_STOP) \
475 return; \
476 do_trap(trapnr, signr, str, regs, error_code, &info); \
477}
478
479DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip)
480DO_ERROR( 4, SIGSEGV, "overflow", overflow)
481DO_ERROR( 5, SIGSEGV, "bounds", bounds)
482DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip)
483DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
484DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
485DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
486DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
487DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
488DO_ERROR(18, SIGSEGV, "reserved", reserved)
489
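For readers unfamiliar with the idiom, each DO_ERROR()/DO_ERROR_INFO() line below stamps out a complete trap handler from the macro defined just above. Roughly (whitespace added), DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) expands to:

asmlinkage void do_invalid_TSS(struct pt_regs * regs, long error_code)
{
	if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
							== NOTIFY_STOP)
		return;
	do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}
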
490#define DO_ERROR_STACK(trapnr, signr, str, name) \
491asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \
492{ \
493 struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \
494 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
495 == NOTIFY_STOP) \
496 return regs; \
497 if (regs->cs & 3) { \
498 memcpy(pr, regs, sizeof(struct pt_regs)); \
499 regs = pr; \
500 } \
501 do_trap(trapnr, signr, str, regs, error_code, NULL); \
502 return regs; \
503}
504
505DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment)
506DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault)
507
508asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
509{
510 conditional_sti(regs);
511
512#ifdef CONFIG_CHECKING
513 {
514 unsigned long gs;
515 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
516 rdmsrl(MSR_GS_BASE, gs);
517 if (gs != (unsigned long)pda) {
518 wrmsrl(MSR_GS_BASE, pda);
519 oops_in_progress++;
520 printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
521 oops_in_progress--;
522 }
523 }
524#endif
525
526 if ((regs->cs & 3)!=0) {
527 struct task_struct *tsk = current;
528
529 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
530 printk(KERN_INFO
531 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
532 tsk->comm, tsk->pid,
533 regs->rip,regs->rsp,error_code);
534
535 tsk->thread.error_code = error_code;
536 tsk->thread.trap_no = 13;
537 force_sig(SIGSEGV, tsk);
538 return;
539 }
540
541 /* kernel gp */
542 {
543 const struct exception_table_entry *fixup;
544 fixup = search_exception_tables(regs->rip);
545 if (fixup) {
546 regs->rip = fixup->fixup;
547 return;
548 }
549 if (notify_die(DIE_GPF, "general protection fault", regs,
550 error_code, 13, SIGSEGV) == NOTIFY_STOP)
551 return;
552 die("general protection fault", regs, error_code);
553 }
554}
555
556static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
557{
558 printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
559 printk("You probably have a hardware problem with your RAM chips\n");
560
561 /* Clear and disable the memory parity error line. */
562 reason = (reason & 0xf) | 4;
563 outb(reason, 0x61);
564}
565
566static void io_check_error(unsigned char reason, struct pt_regs * regs)
567{
568 printk("NMI: IOCK error (debug interrupt?)\n");
569 show_registers(regs);
570
571 /* Re-enable the IOCK line, wait for a few seconds */
572 reason = (reason & 0xf) | 8;
573 outb(reason, 0x61);
574 mdelay(2000);
575 reason &= ~8;
576 outb(reason, 0x61);
577}
578
579static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
580{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
581 printk("Dazed and confused, but trying to continue\n");
582 printk("Do you have a strange power saving mode enabled?\n");
583}
584
585asmlinkage void default_do_nmi(struct pt_regs *regs)
586{
587 unsigned char reason = 0;
588
589 /* Only the BSP gets external NMIs from the system. */
590 if (!smp_processor_id())
591 reason = get_nmi_reason();
592
593 if (!(reason & 0xc0)) {
594 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
595 == NOTIFY_STOP)
596 return;
597#ifdef CONFIG_X86_LOCAL_APIC
598 /*
599 * Ok, so this is none of the documented NMI sources,
600 * so it must be the NMI watchdog.
601 */
602 if (nmi_watchdog > 0) {
603 nmi_watchdog_tick(regs,reason);
604 return;
605 }
606#endif
607 unknown_nmi_error(reason, regs);
608 return;
609 }
610 if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
611 return;
612
613 /* AK: following checks seem to be broken on modern chipsets. FIXME */
614
615 if (reason & 0x80)
616 mem_parity_error(reason, regs);
617 if (reason & 0x40)
618 io_check_error(reason, regs);
619}
620
621asmlinkage void do_int3(struct pt_regs * regs, long error_code)
622{
623 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
624 return;
625 }
626 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
627 return;
628}
629
630/* runs on IST stack. */
631asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code)
632{
633 struct pt_regs *pr;
634 unsigned long condition;
635 struct task_struct *tsk = current;
636 siginfo_t info;
637
638 pr = (struct pt_regs *)(current->thread.rsp0)-1;
639 if (regs->cs & 3) {
640 memcpy(pr, regs, sizeof(struct pt_regs));
641 regs = pr;
642 }
643
644#ifdef CONFIG_CHECKING
645 {
646 /* RED-PEN interaction with debugger - could destroy gs */
647 unsigned long gs;
648 struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
649 rdmsrl(MSR_GS_BASE, gs);
650 if (gs != (unsigned long)pda) {
651 wrmsrl(MSR_GS_BASE, pda);
652 printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
653 }
654 }
655#endif
656
657 asm("movq %%db6,%0" : "=r" (condition));
658
659 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
660 SIGTRAP) == NOTIFY_STOP) {
661 return regs;
662 }
663 conditional_sti(regs);
664
665 /* Mask out spurious debug traps due to lazy DR7 setting */
666 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
667 if (!tsk->thread.debugreg7) {
668 goto clear_dr7;
669 }
670 }
671
672 tsk->thread.debugreg6 = condition;
673
674 /* Mask out spurious TF errors due to lazy TF clearing */
675 if ((condition & DR_STEP) &&
676 (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition,
677 1, SIGTRAP) != NOTIFY_STOP)) {
678 /*
679 * The TF error should be masked out only if the current
680 * process is not traced and if the TRAP flag has been set
681 * previously by a tracing process (condition detected by
682 * the PT_DTRACE flag); remember that the i386 TRAP flag
683 * can be modified by the process itself in user mode,
684 * allowing programs to debug themselves without the ptrace()
685 * interface.
686 */
687 if ((regs->cs & 3) == 0)
688 goto clear_TF_reenable;
689 if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
690 goto clear_TF;
691 }
692
693 /* Ok, finally something we can handle */
694 tsk->thread.trap_no = 1;
695 tsk->thread.error_code = error_code;
696 info.si_signo = SIGTRAP;
697 info.si_errno = 0;
698 info.si_code = TRAP_BRKPT;
699 if ((regs->cs & 3) == 0)
700 goto clear_dr7;
701
702 info.si_addr = (void __user *)regs->rip;
703 force_sig_info(SIGTRAP, &info, tsk);
704clear_dr7:
705 asm volatile("movq %0,%%db7"::"r"(0UL));
706 notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP);
707 return regs;
708
709clear_TF_reenable:
710 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
711
712clear_TF:
713 /* RED-PEN could cause spurious errors */
714 if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP)
715 != NOTIFY_STOP)
716 regs->eflags &= ~TF_MASK;
717 return regs;
718}
719
720static int kernel_math_error(struct pt_regs *regs, char *str)
721{
722 const struct exception_table_entry *fixup;
723 fixup = search_exception_tables(regs->rip);
724 if (fixup) {
725 regs->rip = fixup->fixup;
726 return 1;
727 }
728 notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE);
729#if 0
730 /* This should be a die, but warn only for now */
731 die(str, regs, 0);
732#else
733 printk(KERN_DEBUG "%s: %s at ", current->comm, str);
734 printk_address(regs->rip);
735 printk("\n");
736#endif
737 return 0;
738}
739
740/*
741 * Note that we play around with the 'TS' bit in an attempt to get
742 * the correct behaviour even in the presence of the asynchronous
743 * IRQ13 behaviour.
744 */
745asmlinkage void do_coprocessor_error(struct pt_regs *regs)
746{
747 void __user *rip = (void __user *)(regs->rip);
748 struct task_struct * task;
749 siginfo_t info;
750 unsigned short cwd, swd;
751
752 conditional_sti(regs);
753 if ((regs->cs & 3) == 0 &&
754 kernel_math_error(regs, "kernel x87 math error"))
755 return;
756
757 /*
758 * Save the info for the exception handler and clear the error.
759 */
760 task = current;
761 save_init_fpu(task);
762 task->thread.trap_no = 16;
763 task->thread.error_code = 0;
764 info.si_signo = SIGFPE;
765 info.si_errno = 0;
766 info.si_code = __SI_FAULT;
767 info.si_addr = rip;
768 /*
769 * (~cwd & swd) will mask out exceptions that are not set to unmasked
770 * status. 0x3f is the exception bits in these regs, 0x200 is the
771 * C1 reg you need in case of a stack fault, 0x040 is the stack
772 * fault bit. We should only be taking one exception at a time,
773 * so if this combination doesn't produce any single exception,
774 * then we have a bad program that isn't synchronizing its FPU usage
775 * and it will suffer the consequences since we won't be able to
776 * fully reproduce the context of the exception
777 */
778 cwd = get_fpu_cwd(task);
779 swd = get_fpu_swd(task);
780 switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
781 case 0x000:
782 default:
783 break;
784 case 0x001: /* Invalid Op */
785 case 0x041: /* Stack Fault */
786 case 0x241: /* Stack Fault | Direction */
787 info.si_code = FPE_FLTINV;
788 break;
789 case 0x002: /* Denormalize */
790 case 0x010: /* Underflow */
791 info.si_code = FPE_FLTUND;
792 break;
793 case 0x004: /* Zero Divide */
794 info.si_code = FPE_FLTDIV;
795 break;
796 case 0x008: /* Overflow */
797 info.si_code = FPE_FLTOVF;
798 break;
799 case 0x020: /* Precision */
800 info.si_code = FPE_FLTRES;
801 break;
802 }
803 force_sig_info(SIGFPE, &info, task);
804}
805
806asmlinkage void bad_intr(void)
807{
808 printk("bad interrupt");
809}
810
811asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
812{
813 void __user *rip = (void __user *)(regs->rip);
814 struct task_struct * task;
815 siginfo_t info;
816 unsigned short mxcsr;
817
818 conditional_sti(regs);
819 if ((regs->cs & 3) == 0 &&
820 kernel_math_error(regs, "simd math error"))
821 return;
822
823 /*
824 * Save the info for the exception handler and clear the error.
825 */
826 task = current;
827 save_init_fpu(task);
828 task->thread.trap_no = 19;
829 task->thread.error_code = 0;
830 info.si_signo = SIGFPE;
831 info.si_errno = 0;
832 info.si_code = __SI_FAULT;
833 info.si_addr = rip;
834 /*
835 * The SIMD FPU exceptions are handled a little differently, as there
836 * is only a single status/control register. Thus, to determine which
837 * unmasked exception was caught we must mask the exception mask bits
838 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
839 */
840 mxcsr = get_fpu_mxcsr(task);
841 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
842 case 0x000:
843 default:
844 break;
845 case 0x001: /* Invalid Op */
846 info.si_code = FPE_FLTINV;
847 break;
848 case 0x002: /* Denormalize */
849 case 0x010: /* Underflow */
850 info.si_code = FPE_FLTUND;
851 break;
852 case 0x004: /* Zero Divide */
853 info.si_code = FPE_FLTDIV;
854 break;
855 case 0x008: /* Overflow */
856 info.si_code = FPE_FLTOVF;
857 break;
858 case 0x020: /* Precision */
859 info.si_code = FPE_FLTRES;
860 break;
861 }
862 force_sig_info(SIGFPE, &info, task);
863}
864
865asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
866{
867}
868
869asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
870{
871}
872
873/*
874 * 'math_state_restore()' saves the current math information in the
875 * old math state array, and gets the new ones from the current task
876 *
877 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
878 * Don't touch unless you *really* know how it works.
879 */
880asmlinkage void math_state_restore(void)
881{
882 struct task_struct *me = current;
883 clts(); /* Allow maths ops (or we recurse) */
884
885 if (!used_math())
886 init_fpu(me);
887 restore_fpu_checking(&me->thread.i387.fxsave);
888 me->thread_info->status |= TS_USEDFPU;
889}
890
891void do_call_debug(struct pt_regs *regs)
892{
893 notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
894}
895
896void __init trap_init(void)
897{
898 set_intr_gate(0,&divide_error);
899 set_intr_gate_ist(1,&debug,DEBUG_STACK);
900 set_intr_gate_ist(2,&nmi,NMI_STACK);
901 set_system_gate(3,&int3);
902 set_system_gate(4,&overflow); /* int4-5 can be called from all */
903 set_system_gate(5,&bounds);
904 set_intr_gate(6,&invalid_op);
905 set_intr_gate(7,&device_not_available);
906 set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
907 set_intr_gate(9,&coprocessor_segment_overrun);
908 set_intr_gate(10,&invalid_TSS);
909 set_intr_gate(11,&segment_not_present);
910 set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
911 set_intr_gate(13,&general_protection);
912 set_intr_gate(14,&page_fault);
913 set_intr_gate(15,&spurious_interrupt_bug);
914 set_intr_gate(16,&coprocessor_error);
915 set_intr_gate(17,&alignment_check);
916#ifdef CONFIG_X86_MCE
917 set_intr_gate_ist(18,&machine_check, MCE_STACK);
918#endif
919 set_intr_gate(19,&simd_coprocessor_error);
920
921#ifdef CONFIG_IA32_EMULATION
922 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
923#endif
924
925 set_intr_gate(KDB_VECTOR, call_debug);
926
927 /*
928 * Should be a barrier for any external CPU state.
929 */
930 cpu_init();
931}
932
933
934/* Actual parsing is done early in setup.c. */
935static int __init oops_dummy(char *s)
936{
937 panic_on_oops = 1;
938 return -1;
939}
940__setup("oops=", oops_dummy);
941
942static int __init kstack_setup(char *s)
943{
944 kstack_depth_to_print = simple_strtoul(s,NULL,0);
945 return 0;
946}
947__setup("kstack=", kstack_setup);
948
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
new file mode 100644
index 000000000000..59ebd5beda87
--- /dev/null
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -0,0 +1,164 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#include <asm-generic/vmlinux.lds.h>
6#include <linux/config.h>
7
8OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
9OUTPUT_ARCH(i386:x86-64)
10ENTRY(phys_startup_64)
11jiffies_64 = jiffies;
12SECTIONS
13{
14 . = 0xffffffff80100000;
15 phys_startup_64 = startup_64 - LOAD_OFFSET;
16 _text = .; /* Text and read-only data */
17 .text : {
18 *(.text)
19 SCHED_TEXT
20 LOCK_TEXT
21 *(.fixup)
22 *(.gnu.warning)
23 } = 0x9090
24 .text.lock : { *(.text.lock) } /* out-of-line lock text */
25
26 _etext = .; /* End of text section */
27
28 . = ALIGN(16); /* Exception table */
29 __start___ex_table = .;
30 __ex_table : { *(__ex_table) }
31 __stop___ex_table = .;
32
33 RODATA
34
35 .data : { /* Data */
36 *(.data)
37 CONSTRUCTORS
38 }
39
40 _edata = .; /* End of data section */
41
42 __bss_start = .; /* BSS */
43 .bss : {
44 *(.bss.page_aligned)
45 *(.bss)
46 }
47 __bss_end = .;
48
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
50 .data.cacheline_aligned : { *(.data.cacheline_aligned) }
51
52#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
53#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
54#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
55
56 .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
57 __vsyscall_0 = LOADADDR(.vsyscall_0);
58 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
59 .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
60 xtime_lock = LOADADDR(.xtime_lock);
61 .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
62 vxtime = LOADADDR(.vxtime);
63 .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
64 wall_jiffies = LOADADDR(.wall_jiffies);
65 .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
66 sys_tz = LOADADDR(.sys_tz);
67 .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
68 sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
69 .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
70 xtime = LOADADDR(.xtime);
71 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
72 .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
73 jiffies = LOADADDR(.jiffies);
74 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
75 . = LOADADDR(.vsyscall_0) + 4096;
76
77 . = ALIGN(8192); /* init_task */
78 .data.init_task : { *(.data.init_task) }
79
80 . = ALIGN(4096);
81 .data.page_aligned : { *(.data.page_aligned) }
82
83 . = ALIGN(4096); /* Init code and data */
84 __init_begin = .;
85 .init.text : {
86 _sinittext = .;
87 *(.init.text)
88 _einittext = .;
89 }
90 __initdata_begin = .;
91 .init.data : { *(.init.data) }
92 __initdata_end = .;
93 . = ALIGN(16);
94 __setup_start = .;
95 .init.setup : { *(.init.setup) }
96 __setup_end = .;
97 __initcall_start = .;
98 .initcall.init : {
99 *(.initcall1.init)
100 *(.initcall2.init)
101 *(.initcall3.init)
102 *(.initcall4.init)
103 *(.initcall5.init)
104 *(.initcall6.init)
105 *(.initcall7.init)
106 }
107 __initcall_end = .;
108 __con_initcall_start = .;
109 .con_initcall.init : { *(.con_initcall.init) }
110 __con_initcall_end = .;
111 SECURITY_INIT
112 . = ALIGN(8);
113 __alt_instructions = .;
114 .altinstructions : { *(.altinstructions) }
115 __alt_instructions_end = .;
116 .altinstr_replacement : { *(.altinstr_replacement) }
117 /* .exit.text is discarded at runtime, not link time, to deal with references
118 from .altinstructions and .eh_frame */
119 .exit.text : { *(.exit.text) }
120 .exit.data : { *(.exit.data) }
121 . = ALIGN(4096);
122 __initramfs_start = .;
123 .init.ramfs : { *(.init.ramfs) }
124 __initramfs_end = .;
125 . = ALIGN(32);
126 __per_cpu_start = .;
127 .data.percpu : { *(.data.percpu) }
128 __per_cpu_end = .;
129 . = ALIGN(4096);
130 __init_end = .;
131
132 . = ALIGN(4096);
133 __nosave_begin = .;
134 .data_nosave : { *(.data.nosave) }
135 . = ALIGN(4096);
136 __nosave_end = .;
137
138 _end = . ;
139
140 /* Sections to be discarded */
141 /DISCARD/ : {
142 *(.exitcall.exit)
143#ifndef CONFIG_DEBUG_INFO
144 *(.eh_frame)
145#endif
146 }
147
148 /* DWARF 2 */
149 .debug_info 0 : { *(.debug_info) }
150 .debug_abbrev 0 : { *(.debug_abbrev) }
151 .debug_line 0 : { *(.debug_line) }
152 .debug_frame 0 : { *(.debug_frame) }
153 .debug_str 0 : { *(.debug_str) }
154 .debug_loc 0 : { *(.debug_loc) }
155 .debug_macinfo 0 : { *(.debug_macinfo) }
156 /* SGI/MIPS DWARF 2 extensions */
157 .debug_weaknames 0 : { *(.debug_weaknames) }
158 .debug_funcnames 0 : { *(.debug_funcnames) }
159 .debug_typenames 0 : { *(.debug_typenames) }
160 .debug_varnames 0 : { *(.debug_varnames) }
161
162
163 .comment 0 : { *(.comment) }
164}
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
new file mode 100644
index 000000000000..b4b8dc59663a
--- /dev/null
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -0,0 +1,225 @@
1/*
2 * linux/arch/x86_64/kernel/vsyscall.c
3 *
4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Copyright 2003 Andi Kleen, SuSE Labs.
6 *
7 * Thanks to hpa@transmeta.com for some useful hints.
8 * Special thanks to Ingo Molnar for his early experience with
9 * a different vsyscall implementation for Linux/IA32 and for the name.
10 *
11 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
12 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
13 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
14 * jumping out of line if necessary. We cannot add more with this
15 * mechanism because older kernels won't return -ENOSYS.
16 * If we want more than four we need a vDSO.
17 *
18 * Note: the concept clashes with user mode linux. If you use UML and
19 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
20 */
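
The addressing scheme described above is fixed: the vsyscall page is linked at -10 MB (see the .vsyscall_0 output section in vmlinux.lds.S), and each vsyscall occupies a 1024-byte slot within it. A sketch of where the four slots land, assuming 64-bit longs:

#include <stdio.h>

int main(void)
{
	/* -10 MB wraps to the top of the 64-bit address space */
	unsigned long base = (unsigned long)(-10L * 1024 * 1024);
	int nr;

	for (nr = 0; nr < 4; nr++)	/* slot 0 is vgettimeofday, slot 1 is vtime */
		printf("vsyscall %d at %#lx\n", nr, base + nr * 1024UL);
	return 0;
}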
21
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/kernel.h>
25#include <linux/timer.h>
26#include <linux/seqlock.h>
27#include <linux/jiffies.h>
28#include <linux/sysctl.h>
29
30#include <asm/vsyscall.h>
31#include <asm/pgtable.h>
32#include <asm/page.h>
33#include <asm/fixmap.h>
34#include <asm/errno.h>
35#include <asm/io.h>
36
37#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
38#define force_inline __attribute__((always_inline)) inline
39
40int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
41seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
42
43#include <asm/unistd.h>
44
45static force_inline void timeval_normalize(struct timeval * tv)
46{
47 time_t __sec;
48
49 __sec = tv->tv_usec / 1000000;
50 if (__sec) {
51 tv->tv_usec %= 1000000;
52 tv->tv_sec += __sec;
53 }
54}
55
56static force_inline void do_vgettimeofday(struct timeval * tv)
57{
58 long sequence, t;
59 unsigned long sec, usec;
60
61 do {
62 sequence = read_seqbegin(&__xtime_lock);
63
64 sec = __xtime.tv_sec;
65 usec = (__xtime.tv_nsec / 1000) +
66 (__jiffies - __wall_jiffies) * (1000000 / HZ);
67
68 if (__vxtime.mode == VXTIME_TSC) {
69 sync_core();
70 rdtscll(t);
71 if (t < __vxtime.last_tsc)
72 t = __vxtime.last_tsc;
73 usec += ((t - __vxtime.last_tsc) *
74 __vxtime.tsc_quot) >> 32;
75 /* See comment in x86_64 do_gettimeofday. */
76 } else {
77 usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
78 __vxtime.last) * __vxtime.quot) >> 32;
79 }
80 } while (read_seqretry(&__xtime_lock, sequence));
81
82 tv->tv_sec = sec + usec / 1000000;
83 tv->tv_usec = usec % 1000000;
84}
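
The TSC branch above converts a cycle delta to microseconds with the same 32.32 fixed-point trick set up in time_init(): tsc_quot = (1000 << 32) / cpu_khz, so (delta * tsc_quot) >> 32 equals delta * 1000 / cpu_khz, i.e. microseconds. A user-space sketch with a hypothetical 2 GHz clock:

#include <stdio.h>

int main(void)
{
	unsigned long long cpu_khz = 2000000;			/* example: 2 GHz */
	unsigned long long tsc_quot = (1000ULL << 32) / cpu_khz;	/* 32.32 fixed point */
	unsigned long long delta = 3000000;			/* cycles since last tick */
	unsigned long long usec = (delta * tsc_quot) >> 32;

	/* 3e6 cycles at 2 GHz is 1.5 ms, so this prints roughly 1500 */
	printf("%llu usec\n", usec);
	return 0;
}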
85
86/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
87static force_inline void do_get_tz(struct timezone * tz)
88{
89 *tz = __sys_tz;
90}
91
92static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
93{
94 int ret;
95 asm volatile("vsysc2: syscall"
96 : "=a" (ret)
97 : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
98 return ret;
99}
100
101static force_inline long time_syscall(long *t)
102{
103 long secs;
104 asm volatile("vsysc1: syscall"
105 : "=a" (secs)
106 : "0" (__NR_time),"D" (t) : __syscall_clobber);
107 return secs;
108}
109
110static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
111{
112 if (unlikely(!__sysctl_vsyscall))
113 return gettimeofday(tv,tz);
114 if (tv)
115 do_vgettimeofday(tv);
116 if (tz)
117 do_get_tz(tz);
118 return 0;
119}
120
121/* This will break when the xtime seconds get inaccurate, but that is
122 * unlikely */
123static time_t __vsyscall(1) vtime(time_t *t)
124{
125 if (unlikely(!__sysctl_vsyscall))
126 return time_syscall(t);
127 else if (t)
128 *t = __xtime.tv_sec;
129 return __xtime.tv_sec;
130}
131
132static long __vsyscall(2) venosys_0(void)
133{
134 return -ENOSYS;
135}
136
137static long __vsyscall(3) venosys_1(void)
138{
139 return -ENOSYS;
140}
141
142#ifdef CONFIG_SYSCTL
143
144#define SYSCALL 0x050f
145#define NOP2 0x9090
146
147/*
148 * NOP out syscall in vsyscall page when not needed.
149 */
150static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
151 void __user *buffer, size_t *lenp, loff_t *ppos)
152{
153 extern u16 vsysc1, vsysc2;
154 u16 *map1, *map2;
155 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
156 if (!write)
157 return ret;
158 /* gcc has some trouble with __va(__pa()), so just do it this
159 way. */
160 map1 = ioremap(__pa_symbol(&vsysc1), 2);
161 if (!map1)
162 return -ENOMEM;
163 map2 = ioremap(__pa_symbol(&vsysc2), 2);
164 if (!map2) {
165 ret = -ENOMEM;
166 goto out;
167 }
168 if (!sysctl_vsyscall) {
169 *map1 = SYSCALL;
170 *map2 = SYSCALL;
171 } else {
172 *map1 = NOP2;
173 *map2 = NOP2;
174 }
175 iounmap(map2);
176out:
177 iounmap(map1);
178 return ret;
179}
180
181static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
182 void __user *oldval, size_t __user *oldlenp,
183 void __user *newval, size_t newlen,
184 void **context)
185{
186 return -ENOSYS;
187}
188
189static ctl_table kernel_table2[] = {
190 { .ctl_name = 99, .procname = "vsyscall64",
191 .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
192 .strategy = vsyscall_sysctl_nostrat,
193 .proc_handler = vsyscall_sysctl_change },
194 { 0, }
195};
196
197static ctl_table kernel_root_table2[] = {
198 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
199 .child = kernel_table2 },
200 { 0 },
201};
202
203#endif
204
205static void __init map_vsyscall(void)
206{
207 extern char __vsyscall_0;
208 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
209
210 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
211}
212
213static int __init vsyscall_init(void)
214{
215 BUG_ON(((unsigned long) &vgettimeofday !=
216 VSYSCALL_ADDR(__NR_vgettimeofday)));
217 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
218 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
219 map_vsyscall();
220 sysctl_vsyscall = 1;
221 register_sysctl_table(kernel_root_table2, 0);
222 return 0;
223}
224
225__initcall(vsyscall_init);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
new file mode 100644
index 000000000000..88626e626886
--- /dev/null
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -0,0 +1,221 @@
1#include <linux/config.h>
2#include <linux/module.h>
3#include <linux/smp.h>
4#include <linux/user.h>
5#include <linux/sched.h>
6#include <linux/in6.h>
7#include <linux/interrupt.h>
8#include <linux/smp_lock.h>
9#include <linux/pm.h>
10#include <linux/pci.h>
11#include <linux/apm_bios.h>
12#include <linux/kernel.h>
13#include <linux/string.h>
14#include <linux/syscalls.h>
15#include <linux/tty.h>
16#include <linux/ioctl32.h>
17
18#include <asm/semaphore.h>
19#include <asm/processor.h>
20#include <asm/i387.h>
21#include <asm/uaccess.h>
22#include <asm/checksum.h>
23#include <asm/io.h>
24#include <asm/delay.h>
25#include <asm/irq.h>
26#include <asm/mmx.h>
27#include <asm/desc.h>
28#include <asm/pgtable.h>
29#include <asm/pgalloc.h>
30#include <asm/nmi.h>
31#include <asm/kdebug.h>
32#include <asm/unistd.h>
33#include <asm/tlbflush.h>
34#include <asm/kdebug.h>
35
36extern spinlock_t rtc_lock;
37
38#ifdef CONFIG_SMP
39extern void __write_lock_failed(rwlock_t *rw);
40extern void __read_lock_failed(rwlock_t *rw);
41#endif
42
43#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
44extern struct drive_info_struct drive_info;
45EXPORT_SYMBOL(drive_info);
46#endif
47
48extern unsigned long get_cmos_time(void);
49
50/* platform dependent support */
51EXPORT_SYMBOL(boot_cpu_data);
52//EXPORT_SYMBOL(dump_fpu);
53EXPORT_SYMBOL(__ioremap);
54EXPORT_SYMBOL(ioremap_nocache);
55EXPORT_SYMBOL(iounmap);
56EXPORT_SYMBOL(enable_irq);
57EXPORT_SYMBOL(disable_irq);
58EXPORT_SYMBOL(disable_irq_nosync);
59EXPORT_SYMBOL(probe_irq_mask);
60EXPORT_SYMBOL(kernel_thread);
61EXPORT_SYMBOL(pm_idle);
62EXPORT_SYMBOL(pm_power_off);
63EXPORT_SYMBOL(get_cmos_time);
64
65EXPORT_SYMBOL(__down_failed);
66EXPORT_SYMBOL(__down_failed_interruptible);
67EXPORT_SYMBOL(__down_failed_trylock);
68EXPORT_SYMBOL(__up_wakeup);
69/* Networking helper routines. */
70EXPORT_SYMBOL(csum_partial_copy_nocheck);
71EXPORT_SYMBOL(ip_compute_csum);
72/* Delay loops */
73EXPORT_SYMBOL(__udelay);
74EXPORT_SYMBOL(__ndelay);
75EXPORT_SYMBOL(__delay);
76EXPORT_SYMBOL(__const_udelay);
77
78EXPORT_SYMBOL(__get_user_1);
79EXPORT_SYMBOL(__get_user_2);
80EXPORT_SYMBOL(__get_user_4);
81EXPORT_SYMBOL(__get_user_8);
82EXPORT_SYMBOL(__put_user_1);
83EXPORT_SYMBOL(__put_user_2);
84EXPORT_SYMBOL(__put_user_4);
85EXPORT_SYMBOL(__put_user_8);
86
87EXPORT_SYMBOL(strpbrk);
88EXPORT_SYMBOL(strstr);
89
90EXPORT_SYMBOL(strncpy_from_user);
91EXPORT_SYMBOL(__strncpy_from_user);
92EXPORT_SYMBOL(clear_user);
93EXPORT_SYMBOL(__clear_user);
94EXPORT_SYMBOL(copy_user_generic);
95EXPORT_SYMBOL(copy_from_user);
96EXPORT_SYMBOL(copy_to_user);
97EXPORT_SYMBOL(copy_in_user);
98EXPORT_SYMBOL(strnlen_user);
99
100#ifdef CONFIG_PCI
101EXPORT_SYMBOL(pci_alloc_consistent);
102EXPORT_SYMBOL(pci_free_consistent);
103#endif
104
105#ifdef CONFIG_PCI
106EXPORT_SYMBOL(pci_mem_start);
107#endif
108
109EXPORT_SYMBOL(copy_page);
110EXPORT_SYMBOL(clear_page);
111
112EXPORT_SYMBOL(cpu_pda);
113#ifdef CONFIG_SMP
114EXPORT_SYMBOL(cpu_data);
115EXPORT_SYMBOL(cpu_online_map);
116EXPORT_SYMBOL(__write_lock_failed);
117EXPORT_SYMBOL(__read_lock_failed);
118
119EXPORT_SYMBOL(synchronize_irq);
120EXPORT_SYMBOL(smp_call_function);
121EXPORT_SYMBOL(cpu_callout_map);
122#endif
123
124#ifdef CONFIG_VT
125EXPORT_SYMBOL(screen_info);
126#endif
127
128EXPORT_SYMBOL(get_wchan);
129
130EXPORT_SYMBOL(rtc_lock);
131
132EXPORT_SYMBOL_GPL(set_nmi_callback);
133EXPORT_SYMBOL_GPL(unset_nmi_callback);
134
135/* Export string functions. We normally rely on gcc builtins for most of these,
136   but gcc sometimes decides not to inline them. */
137#undef memcpy
138#undef memset
139#undef memmove
140#undef memchr
141#undef strlen
142#undef strcpy
143#undef strncmp
144#undef strncpy
145#undef strchr
146#undef strcmp
147#undef strcpy
148#undef strcat
149#undef memcmp
150
151extern void * memset(void *,int,__kernel_size_t);
152extern size_t strlen(const char *);
153extern void * memmove(void * dest,const void *src,size_t count);
154extern char * strcpy(char * dest,const char *src);
155extern int strcmp(const char * cs,const char * ct);
156extern void *memchr(const void *s, int c, size_t n);
157extern void * memcpy(void *,const void *,__kernel_size_t);
158extern void * __memcpy(void *,const void *,__kernel_size_t);
159extern char * strcat(char *, const char *);
160extern int memcmp(const void * cs,const void * ct,size_t count);
161
162EXPORT_SYMBOL(memset);
163EXPORT_SYMBOL(strlen);
164EXPORT_SYMBOL(memmove);
165EXPORT_SYMBOL(strcpy);
166EXPORT_SYMBOL(strncmp);
167EXPORT_SYMBOL(strncpy);
168EXPORT_SYMBOL(strchr);
169EXPORT_SYMBOL(strcmp);
170EXPORT_SYMBOL(strcat);
171EXPORT_SYMBOL(strncat);
172EXPORT_SYMBOL(memchr);
173EXPORT_SYMBOL(strrchr);
174EXPORT_SYMBOL(strnlen);
175EXPORT_SYMBOL(memscan);
176EXPORT_SYMBOL(memcpy);
177EXPORT_SYMBOL(__memcpy);
178EXPORT_SYMBOL(memcmp);
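/*
 * Illustrative user-space sketch (editorial addition): why the #undef block
 * above is needed.  <asm/string.h> may define memcpy and friends as macros
 * expanding to gcc builtins, so the macro has to be undefined before the
 * address of the real out-of-line function can be taken for EXPORT_SYMBOL().
 * The same idea, shown with a plain function pointer:
 */
#include <string.h>
#undef memcpy		/* drop any macro so &memcpy names the real function */
#include <stdio.h>

static void *(*memcpy_ptr)(void *, const void *, size_t) = memcpy;

int main(void)
{
	char dst[4];

	memcpy_ptr(dst, "abc", 4);	/* call through the function pointer */
	puts(dst);
	return 0;
}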
179
180#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
181/* prototypes are wrong; these are assembly routines with custom calling conventions */
182extern void rwsem_down_read_failed_thunk(void);
183extern void rwsem_wake_thunk(void);
184extern void rwsem_downgrade_thunk(void);
185extern void rwsem_down_write_failed_thunk(void);
186EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
187EXPORT_SYMBOL(rwsem_wake_thunk);
188EXPORT_SYMBOL(rwsem_downgrade_thunk);
189EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
190#endif
191
192EXPORT_SYMBOL(empty_zero_page);
193
194#ifdef CONFIG_HAVE_DEC_LOCK
195EXPORT_SYMBOL(_atomic_dec_and_lock);
196#endif
197
198EXPORT_SYMBOL(die_chain);
199EXPORT_SYMBOL(register_die_notifier);
200
201#ifdef CONFIG_SMP
202EXPORT_SYMBOL(cpu_sibling_map);
203EXPORT_SYMBOL(smp_num_siblings);
204#endif
205
206extern void do_softirq_thunk(void);
207EXPORT_SYMBOL(do_softirq_thunk);
208
209void out_of_line_bug(void);
210EXPORT_SYMBOL(out_of_line_bug);
211
212EXPORT_SYMBOL(init_level4_pgt);
213
214extern unsigned long __supported_pte_mask;
215EXPORT_SYMBOL(__supported_pte_mask);
216
217#ifdef CONFIG_SMP
218EXPORT_SYMBOL(flush_tlb_page);
219#endif
220
221EXPORT_SYMBOL(cpu_khz);
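/*
 * Illustrative sketch (editorial addition, not part of this file): the
 * EXPORT_SYMBOL() lines above exist so that loadable modules can link
 * against these symbols.  A minimal 2.6-era module using the get_cmos_time
 * export (prototype as declared earlier in this file) might look like this.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>

extern unsigned long get_cmos_time(void);

static int __init cmos_demo_init(void)
{
	printk(KERN_INFO "cmos time: %lu\n", get_cmos_time());
	return 0;
}

static void __exit cmos_demo_exit(void)
{
}

module_init(cmos_demo_init);
module_exit(cmos_demo_exit);
MODULE_LICENSE("GPL");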