Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--  arch/x86_64/lib/Makefile        |  14
-rw-r--r--  arch/x86_64/lib/bitops.c        | 141
-rw-r--r--  arch/x86_64/lib/bitstr.c        |  28
-rw-r--r--  arch/x86_64/lib/clear_page.S    |  50
-rw-r--r--  arch/x86_64/lib/copy_page.S     | 101
-rw-r--r--  arch/x86_64/lib/copy_user.S     | 294
-rw-r--r--  arch/x86_64/lib/csum-copy.S     | 233
-rw-r--r--  arch/x86_64/lib/csum-partial.c  | 150
-rw-r--r--  arch/x86_64/lib/csum-wrappers.c | 129
-rw-r--r--  arch/x86_64/lib/dec_and_lock.c  |  40
-rw-r--r--  arch/x86_64/lib/delay.c         |  48
-rw-r--r--  arch/x86_64/lib/getuser.S       | 101
-rw-r--r--  arch/x86_64/lib/io.c            |  23
-rw-r--r--  arch/x86_64/lib/memcpy.S        | 121
-rw-r--r--  arch/x86_64/lib/memmove.c       |  19
-rw-r--r--  arch/x86_64/lib/memset.S        | 125
-rw-r--r--  arch/x86_64/lib/putuser.S       |  89
-rw-r--r--  arch/x86_64/lib/thunk.S         |  95
-rw-r--r--  arch/x86_64/lib/usercopy.c      | 153
19 files changed, 1954 insertions(+), 0 deletions(-)
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 000000000000..6b26a1c1e9ff
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
1 | # | ||
2 | # Makefile for x86_64-specific library files. | ||
3 | # | ||
4 | |||
5 | CFLAGS_csum-partial.o := -funroll-loops | ||
6 | |||
7 | obj-y := io.o | ||
8 | |||
9 | lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ | ||
10 | usercopy.o getuser.o putuser.o \ | ||
11 | thunk.o clear_page.o copy_page.o bitstr.o bitops.o | ||
12 | lib-y += memcpy.o memmove.o memset.o copy_user.o | ||
13 | |||
14 | lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o | ||
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 000000000000..a29fb75b33ac
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
1 | #include <linux/bitops.h> | ||
2 | |||
3 | #undef find_first_zero_bit | ||
4 | #undef find_next_zero_bit | ||
5 | #undef find_first_bit | ||
6 | #undef find_next_bit | ||
7 | |||
8 | /** | ||
9 | * find_first_zero_bit - find the first zero bit in a memory region | ||
10 | * @addr: The address to start the search at | ||
11 | * @size: The maximum size to search | ||
12 | * | ||
13 | * Returns the bit-number of the first zero bit, not the number of the byte | ||
14 | * containing a bit. | ||
15 | */ | ||
16 | inline long find_first_zero_bit(const unsigned long * addr, unsigned long size) | ||
17 | { | ||
18 | long d0, d1, d2; | ||
19 | long res; | ||
20 | |||
21 | if (!size) | ||
22 | return 0; | ||
23 | asm volatile( | ||
24 | " repe; scasq\n" | ||
25 | " je 1f\n" | ||
26 | " xorq -8(%%rdi),%%rax\n" | ||
27 | " subq $8,%%rdi\n" | ||
28 | " bsfq %%rax,%%rdx\n" | ||
29 | "1: subq %[addr],%%rdi\n" | ||
30 | " shlq $3,%%rdi\n" | ||
31 | " addq %%rdi,%%rdx" | ||
32 | :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) | ||
33 | :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL), | ||
34 | [addr] "r" (addr) : "memory"); | ||
35 | return res; | ||
36 | } | ||
37 | |||
38 | /** | ||
39 | * find_next_zero_bit - find the next zero bit in a memory region | ||
40 | * @addr: The address to base the search on | ||
41 | * @offset: The bitnumber to start searching at | ||
42 | * @size: The maximum size to search | ||
43 | */ | ||
44 | long find_next_zero_bit (const unsigned long * addr, long size, long offset) | ||
45 | { | ||
46 | unsigned long * p = ((unsigned long *) addr) + (offset >> 6); | ||
47 | unsigned long set = 0; | ||
48 | unsigned long res, bit = offset&63; | ||
49 | |||
50 | if (bit) { | ||
51 | /* | ||
52 | * Look for zero in first word | ||
53 | */ | ||
54 | asm("bsfq %1,%0\n\t" | ||
55 | "cmoveq %2,%0" | ||
56 | : "=r" (set) | ||
57 | : "r" (~(*p >> bit)), "r"(64L)); | ||
58 | if (set < (64 - bit)) | ||
59 | return set + offset; | ||
60 | set = 64 - bit; | ||
61 | p++; | ||
62 | } | ||
63 | /* | ||
64 | * No zero yet, search remaining full words for a zero | ||
65 | */ | ||
66 | res = find_first_zero_bit ((const unsigned long *)p, | ||
67 | size - 64 * (p - (unsigned long *) addr)); | ||
68 | return (offset + set + res); | ||
69 | } | ||
70 | |||
71 | static inline long | ||
72 | __find_first_bit(const unsigned long * addr, unsigned long size) | ||
73 | { | ||
74 | long d0, d1; | ||
75 | long res; | ||
76 | |||
77 | asm volatile( | ||
78 | " repe; scasq\n" | ||
79 | " jz 1f\n" | ||
80 | " subq $8,%%rdi\n" | ||
81 | " bsfq (%%rdi),%%rax\n" | ||
82 | "1: subq %[addr],%%rdi\n" | ||
83 | " shlq $3,%%rdi\n" | ||
84 | " addq %%rdi,%%rax" | ||
85 | :"=a" (res), "=&c" (d0), "=&D" (d1) | ||
86 | :"0" (0ULL), | ||
87 | "1" ((size + 63) >> 6), "2" (addr), | ||
88 | [addr] "r" (addr) : "memory"); | ||
89 | return res; | ||
90 | } | ||
91 | |||
92 | /** | ||
93 | * find_first_bit - find the first set bit in a memory region | ||
94 | * @addr: The address to start the search at | ||
95 | * @size: The maximum size to search | ||
96 | * | ||
97 | * Returns the bit-number of the first set bit, not the number of the byte | ||
98 | * containing a bit. | ||
99 | */ | ||
100 | long find_first_bit(const unsigned long * addr, unsigned long size) | ||
101 | { | ||
102 | return __find_first_bit(addr,size); | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * find_next_bit - find the next set bit in a memory region | ||
107 | * @addr: The address to base the search on | ||
108 | * @offset: The bitnumber to start searching at | ||
109 | * @size: The maximum size to search | ||
110 | */ | ||
111 | long find_next_bit(const unsigned long * addr, long size, long offset) | ||
112 | { | ||
113 | const unsigned long * p = addr + (offset >> 6); | ||
114 | unsigned long set = 0, bit = offset & 63, res; | ||
115 | |||
116 | if (bit) { | ||
117 | /* | ||
118 | * Look for nonzero in the first 64 bits: | ||
119 | */ | ||
120 | asm("bsfq %1,%0\n\t" | ||
121 | "cmoveq %2,%0\n\t" | ||
122 | : "=r" (set) | ||
123 | : "r" (*p >> bit), "r" (64L)); | ||
124 | if (set < (64 - bit)) | ||
125 | return set + offset; | ||
126 | set = 64 - bit; | ||
127 | p++; | ||
128 | } | ||
129 | /* | ||
130 | * No set bit yet, search remaining full words for a bit | ||
131 | */ | ||
132 | res = __find_first_bit (p, size - 64 * (p - addr)); | ||
133 | return (offset + set + res); | ||
134 | } | ||
135 | |||
136 | #include <linux/module.h> | ||
137 | |||
138 | EXPORT_SYMBOL(find_next_bit); | ||
139 | EXPORT_SYMBOL(find_first_bit); | ||
140 | EXPORT_SYMBOL(find_first_zero_bit); | ||
141 | EXPORT_SYMBOL(find_next_zero_bit); | ||
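For readers unfamiliar with the repe/scasq idiom above, the search is semantically equivalent to the following portable C sketch (illustrative only: the helper name is made up, unsigned long is assumed to be 64 bits, and the not-found return convention is simplified):

    #include <stdio.h>

    /* Sketch of find_first_zero_bit() semantics: index of the first 0 bit,
       or a value >= size when no zero bit exists in the searched words. */
    static unsigned long first_zero_bit_sketch(const unsigned long *addr,
                                               unsigned long size)
    {
            unsigned long i, words = (size + 63) / 64;

            for (i = 0; i < words; i++) {
                    unsigned long w = ~addr[i];          /* zero bits become ones */
                    if (w)
                            return i * 64 + __builtin_ctzl(w);  /* like bsfq */
            }
            return words * 64;
    }

    int main(void)
    {
            unsigned long map[2] = { ~0UL, ~0UL & ~(1UL << 5) };
            printf("%lu\n", first_zero_bit_sketch(map, 128));   /* prints 69 */
            return 0;
    }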
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
1 | #include <linux/module.h> | ||
2 | #include <linux/bitops.h> | ||
3 | |||
4 | /* Find string of zero bits in a bitmap */ | ||
5 | unsigned long | ||
6 | find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) | ||
7 | { | ||
8 | unsigned long n, end, i; | ||
9 | |||
10 | again: | ||
11 | n = find_next_zero_bit(bitmap, nbits, start); | ||
12 | if (n == -1) | ||
13 | return -1; | ||
14 | |||
15 | /* could test bitsliced, but it's hardly worth it */ | ||
16 | end = n+len; | ||
17 | if (end >= nbits) | ||
18 | return -1; | ||
19 | for (i = n+1; i < end; i++) { | ||
20 | if (test_bit(i, bitmap)) { | ||
21 | start = i+1; | ||
22 | goto again; | ||
23 | } | ||
24 | } | ||
25 | return n; | ||
26 | } | ||
27 | |||
28 | EXPORT_SYMBOL(find_next_zero_string); | ||
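find_next_zero_string() is a building block for allocating a contiguous run of bitmap entries (its original user is the K8 GART IOMMU allocator); a minimal caller sketch, with made-up names:

    /* Sketch: find a run of 'len' free bits and mark it allocated.
       alloc_bit_range() is an illustrative name, not part of this patch;
       needs <linux/bitops.h>. */
    static long alloc_bit_range(unsigned long *bitmap, long nbits, int len)
    {
            long start = find_next_zero_string(bitmap, 0, nbits, len);
            long i;

            if (start == -1)
                    return -1;                      /* no free run large enough */
            for (i = start; i < start + len; i++)
                    __set_bit(i, bitmap);           /* claim the run */
            return start;
    }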
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 000000000000..30a9da458c15
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
1 | /* | ||
2 | * Zero a page. | ||
3 | * rdi page | ||
4 | */ | ||
5 | .globl clear_page | ||
6 | .p2align 4 | ||
7 | clear_page: | ||
8 | xorl %eax,%eax | ||
9 | movl $4096/64,%ecx | ||
10 | .p2align 4 | ||
11 | .Lloop: | ||
12 | decl %ecx | ||
13 | #define PUT(x) movq %rax,x*8(%rdi) | ||
14 | movq %rax,(%rdi) | ||
15 | PUT(1) | ||
16 | PUT(2) | ||
17 | PUT(3) | ||
18 | PUT(4) | ||
19 | PUT(5) | ||
20 | PUT(6) | ||
21 | PUT(7) | ||
22 | leaq 64(%rdi),%rdi | ||
23 | jnz .Lloop | ||
24 | nop | ||
25 | ret | ||
26 | clear_page_end: | ||
27 | |||
28 | /* C stepping K8 CPUs run faster using the string instructions. | ||
29 | It is also a lot simpler. Use this when possible */ | ||
30 | |||
31 | #include <asm/cpufeature.h> | ||
32 | |||
33 | .section .altinstructions,"a" | ||
34 | .align 8 | ||
35 | .quad clear_page | ||
36 | .quad clear_page_c | ||
37 | .byte X86_FEATURE_K8_C | ||
38 | .byte clear_page_end-clear_page | ||
39 | .byte clear_page_c_end-clear_page_c | ||
40 | .previous | ||
41 | |||
42 | .section .altinstr_replacement,"ax" | ||
43 | clear_page_c: | ||
44 | movl $4096/8,%ecx | ||
45 | xorl %eax,%eax | ||
46 | rep | ||
47 | stosq | ||
48 | ret | ||
49 | clear_page_c_end: | ||
50 | .previous | ||
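The .altinstructions records above (here and in the other .S files of this patch) follow the layout consumed by the boot-time alternatives patching code, roughly this structure (a sketch; see the kernel's struct alt_instr for the authoritative definition):

    /* One record per patch site, mirroring the .quad/.byte sequence emitted above. */
    struct alt_instr_sketch {
            unsigned char *instr;          /* .quad clear_page        original code     */
            unsigned char *replacement;    /* .quad clear_page_c      replacement code  */
            unsigned char  cpuid;          /* .byte X86_FEATURE_K8_C  required feature  */
            unsigned char  instrlen;       /* .byte clear_page_end-clear_page           */
            unsigned char  replacementlen; /* .byte clear_page_c_end-clear_page_c       */
    };

At boot, when the CPU advertises the feature bit, the replacement bytes are copied over the original and any remainder is padded with NOPs, so the rep/stosq variant transparently replaces the unrolled loop on C stepping K8 CPUs.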
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 000000000000..dd3aa47b6bf5
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
1 | /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ | ||
2 | |||
3 | /* Don't use streaming stores: a normal store is better when the target | ||
4 | ends up in the cache. */ | ||
5 | |||
6 | /* Could vary the prefetch distance based on SMP/UP */ | ||
7 | |||
8 | .globl copy_page | ||
9 | .p2align 4 | ||
10 | copy_page: | ||
11 | subq $3*8,%rsp | ||
12 | movq %rbx,(%rsp) | ||
13 | movq %r12,1*8(%rsp) | ||
14 | movq %r13,2*8(%rsp) | ||
15 | |||
16 | movl $(4096/64)-5,%ecx | ||
17 | .p2align 4 | ||
18 | .Loop64: | ||
19 | dec %rcx | ||
20 | |||
21 | movq (%rsi), %rax | ||
22 | movq 8 (%rsi), %rbx | ||
23 | movq 16 (%rsi), %rdx | ||
24 | movq 24 (%rsi), %r8 | ||
25 | movq 32 (%rsi), %r9 | ||
26 | movq 40 (%rsi), %r10 | ||
27 | movq 48 (%rsi), %r11 | ||
28 | movq 56 (%rsi), %r12 | ||
29 | |||
30 | prefetcht0 5*64(%rsi) | ||
31 | |||
32 | movq %rax, (%rdi) | ||
33 | movq %rbx, 8 (%rdi) | ||
34 | movq %rdx, 16 (%rdi) | ||
35 | movq %r8, 24 (%rdi) | ||
36 | movq %r9, 32 (%rdi) | ||
37 | movq %r10, 40 (%rdi) | ||
38 | movq %r11, 48 (%rdi) | ||
39 | movq %r12, 56 (%rdi) | ||
40 | |||
41 | leaq 64 (%rsi), %rsi | ||
42 | leaq 64 (%rdi), %rdi | ||
43 | |||
44 | jnz .Loop64 | ||
45 | |||
46 | movl $5,%ecx | ||
47 | .p2align 4 | ||
48 | .Loop2: | ||
49 | decl %ecx | ||
50 | |||
51 | movq (%rsi), %rax | ||
52 | movq 8 (%rsi), %rbx | ||
53 | movq 16 (%rsi), %rdx | ||
54 | movq 24 (%rsi), %r8 | ||
55 | movq 32 (%rsi), %r9 | ||
56 | movq 40 (%rsi), %r10 | ||
57 | movq 48 (%rsi), %r11 | ||
58 | movq 56 (%rsi), %r12 | ||
59 | |||
60 | movq %rax, (%rdi) | ||
61 | movq %rbx, 8 (%rdi) | ||
62 | movq %rdx, 16 (%rdi) | ||
63 | movq %r8, 24 (%rdi) | ||
64 | movq %r9, 32 (%rdi) | ||
65 | movq %r10, 40 (%rdi) | ||
66 | movq %r11, 48 (%rdi) | ||
67 | movq %r12, 56 (%rdi) | ||
68 | |||
69 | leaq 64(%rdi),%rdi | ||
70 | leaq 64(%rsi),%rsi | ||
71 | |||
72 | jnz .Loop2 | ||
73 | |||
74 | movq (%rsp),%rbx | ||
75 | movq 1*8(%rsp),%r12 | ||
76 | movq 2*8(%rsp),%r13 | ||
77 | addq $3*8,%rsp | ||
78 | ret | ||
79 | |||
80 | /* C stepping K8 CPUs run faster using the string copy instructions. | ||
81 | It is also a lot simpler. Use this when possible */ | ||
82 | |||
83 | #include <asm/cpufeature.h> | ||
84 | |||
85 | .section .altinstructions,"a" | ||
86 | .align 8 | ||
87 | .quad copy_page | ||
88 | .quad copy_page_c | ||
89 | .byte X86_FEATURE_K8_C | ||
90 | .byte copy_page_c_end-copy_page_c | ||
91 | .byte copy_page_c_end-copy_page_c | ||
92 | .previous | ||
93 | |||
94 | .section .altinstr_replacement,"ax" | ||
95 | copy_page_c: | ||
96 | movl $4096/8,%ecx | ||
97 | rep | ||
98 | movsq | ||
99 | ret | ||
100 | copy_page_c_end: | ||
101 | .previous | ||
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 000000000000..bd556c804248
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
1 | /* Copyright 2002 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v2. | ||
3 | * | ||
4 | * Functions to copy from and to user space. | ||
5 | */ | ||
6 | |||
7 | #define FIX_ALIGNMENT 1 | ||
8 | |||
9 | #include <asm/current.h> | ||
10 | #include <asm/offset.h> | ||
11 | #include <asm/thread_info.h> | ||
12 | #include <asm/cpufeature.h> | ||
13 | |||
14 | /* Standard copy_to_user with segment limit checking */ | ||
15 | .globl copy_to_user | ||
16 | .p2align 4 | ||
17 | copy_to_user: | ||
18 | GET_THREAD_INFO(%rax) | ||
19 | movq %rdi,%rcx | ||
20 | addq %rdx,%rcx | ||
21 | jc bad_to_user | ||
22 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
23 | jae bad_to_user | ||
24 | 2: | ||
25 | .byte 0xe9 /* 32bit jump */ | ||
26 | .long .Lcug-1f | ||
27 | 1: | ||
28 | |||
29 | .section .altinstr_replacement,"ax" | ||
30 | 3: .byte 0xe9 /* replacement jmp with 32bit immediate */ | ||
31 | .long copy_user_generic_c-1b /* offset */ | ||
32 | .previous | ||
33 | .section .altinstructions,"a" | ||
34 | .align 8 | ||
35 | .quad 2b | ||
36 | .quad 3b | ||
37 | .byte X86_FEATURE_K8_C | ||
38 | .byte 5 | ||
39 | .byte 5 | ||
40 | .previous | ||
41 | |||
42 | /* Standard copy_from_user with segment limit checking */ | ||
43 | .globl copy_from_user | ||
44 | .p2align 4 | ||
45 | copy_from_user: | ||
46 | GET_THREAD_INFO(%rax) | ||
47 | movq %rsi,%rcx | ||
48 | addq %rdx,%rcx | ||
49 | jc bad_from_user | ||
50 | cmpq threadinfo_addr_limit(%rax),%rcx | ||
51 | jae bad_from_user | ||
52 | /* FALL THROUGH to copy_user_generic */ | ||
53 | |||
54 | .section .fixup,"ax" | ||
55 | /* must zero dest */ | ||
56 | bad_from_user: | ||
57 | movl %edx,%ecx | ||
58 | xorl %eax,%eax | ||
59 | rep | ||
60 | stosb | ||
61 | bad_to_user: | ||
62 | movl %edx,%eax | ||
63 | ret | ||
64 | .previous | ||
65 | |||
66 | |||
67 | /* | ||
68 | * copy_user_generic - memory copy with exception handling. | ||
69 | * | ||
70 | * Input: | ||
71 | * rdi destination | ||
72 | * rsi source | ||
73 | * rdx count | ||
74 | * | ||
75 | * Output: | ||
76 | * eax uncopied bytes or 0 if successful. | ||
77 | */ | ||
78 | .globl copy_user_generic | ||
79 | .p2align 4 | ||
80 | copy_user_generic: | ||
81 | .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ | ||
82 | .byte 0x66,0x90 | ||
83 | 1: | ||
84 | .section .altinstr_replacement,"ax" | ||
85 | 2: .byte 0xe9 /* near jump with 32bit immediate */ | ||
86 | .long copy_user_generic_c-1b /* offset */ | ||
87 | .previous | ||
88 | .section .altinstructions,"a" | ||
89 | .align 8 | ||
90 | .quad copy_user_generic | ||
91 | .quad 2b | ||
92 | .byte X86_FEATURE_K8_C | ||
93 | .byte 5 | ||
94 | .byte 5 | ||
95 | .previous | ||
96 | .Lcug: | ||
97 | pushq %rbx | ||
98 | xorl %eax,%eax /*zero for the exception handler */ | ||
99 | |||
100 | #ifdef FIX_ALIGNMENT | ||
101 | /* check for bad alignment of destination */ | ||
102 | movl %edi,%ecx | ||
103 | andl $7,%ecx | ||
104 | jnz .Lbad_alignment | ||
105 | .Lafter_bad_alignment: | ||
106 | #endif | ||
107 | |||
108 | movq %rdx,%rcx | ||
109 | |||
110 | movl $64,%ebx | ||
111 | shrq $6,%rdx | ||
112 | decq %rdx | ||
113 | js .Lhandle_tail | ||
114 | |||
115 | .p2align 4 | ||
116 | .Lloop: | ||
117 | .Ls1: movq (%rsi),%r11 | ||
118 | .Ls2: movq 1*8(%rsi),%r8 | ||
119 | .Ls3: movq 2*8(%rsi),%r9 | ||
120 | .Ls4: movq 3*8(%rsi),%r10 | ||
121 | .Ld1: movq %r11,(%rdi) | ||
122 | .Ld2: movq %r8,1*8(%rdi) | ||
123 | .Ld3: movq %r9,2*8(%rdi) | ||
124 | .Ld4: movq %r10,3*8(%rdi) | ||
125 | |||
126 | .Ls5: movq 4*8(%rsi),%r11 | ||
127 | .Ls6: movq 5*8(%rsi),%r8 | ||
128 | .Ls7: movq 6*8(%rsi),%r9 | ||
129 | .Ls8: movq 7*8(%rsi),%r10 | ||
130 | .Ld5: movq %r11,4*8(%rdi) | ||
131 | .Ld6: movq %r8,5*8(%rdi) | ||
132 | .Ld7: movq %r9,6*8(%rdi) | ||
133 | .Ld8: movq %r10,7*8(%rdi) | ||
134 | |||
135 | decq %rdx | ||
136 | |||
137 | leaq 64(%rsi),%rsi | ||
138 | leaq 64(%rdi),%rdi | ||
139 | |||
140 | jns .Lloop | ||
141 | |||
142 | .p2align 4 | ||
143 | .Lhandle_tail: | ||
144 | movl %ecx,%edx | ||
145 | andl $63,%ecx | ||
146 | shrl $3,%ecx | ||
147 | jz .Lhandle_7 | ||
148 | movl $8,%ebx | ||
149 | .p2align 4 | ||
150 | .Lloop_8: | ||
151 | .Ls9: movq (%rsi),%r8 | ||
152 | .Ld9: movq %r8,(%rdi) | ||
153 | decl %ecx | ||
154 | leaq 8(%rdi),%rdi | ||
155 | leaq 8(%rsi),%rsi | ||
156 | jnz .Lloop_8 | ||
157 | |||
158 | .Lhandle_7: | ||
159 | movl %edx,%ecx | ||
160 | andl $7,%ecx | ||
161 | jz .Lende | ||
162 | .p2align 4 | ||
163 | .Lloop_1: | ||
164 | .Ls10: movb (%rsi),%bl | ||
165 | .Ld10: movb %bl,(%rdi) | ||
166 | incq %rdi | ||
167 | incq %rsi | ||
168 | decl %ecx | ||
169 | jnz .Lloop_1 | ||
170 | |||
171 | .Lende: | ||
172 | popq %rbx | ||
173 | ret | ||
174 | |||
175 | #ifdef FIX_ALIGNMENT | ||
176 | /* align destination */ | ||
177 | .p2align 4 | ||
178 | .Lbad_alignment: | ||
179 | movl $8,%r9d | ||
180 | subl %ecx,%r9d | ||
181 | movl %r9d,%ecx | ||
182 | cmpq %r9,%rdx | ||
183 | jz .Lhandle_7 | ||
184 | js .Lhandle_7 | ||
185 | .Lalign_1: | ||
186 | .Ls11: movb (%rsi),%bl | ||
187 | .Ld11: movb %bl,(%rdi) | ||
188 | incq %rsi | ||
189 | incq %rdi | ||
190 | decl %ecx | ||
191 | jnz .Lalign_1 | ||
192 | subq %r9,%rdx | ||
193 | jmp .Lafter_bad_alignment | ||
194 | #endif | ||
195 | |||
196 | /* table sorted by exception address */ | ||
197 | .section __ex_table,"a" | ||
198 | .align 8 | ||
199 | .quad .Ls1,.Ls1e | ||
200 | .quad .Ls2,.Ls2e | ||
201 | .quad .Ls3,.Ls3e | ||
202 | .quad .Ls4,.Ls4e | ||
203 | .quad .Ld1,.Ls1e | ||
204 | .quad .Ld2,.Ls2e | ||
205 | .quad .Ld3,.Ls3e | ||
206 | .quad .Ld4,.Ls4e | ||
207 | .quad .Ls5,.Ls5e | ||
208 | .quad .Ls6,.Ls6e | ||
209 | .quad .Ls7,.Ls7e | ||
210 | .quad .Ls8,.Ls8e | ||
211 | .quad .Ld5,.Ls5e | ||
212 | .quad .Ld6,.Ls6e | ||
213 | .quad .Ld7,.Ls7e | ||
214 | .quad .Ld8,.Ls8e | ||
215 | .quad .Ls9,.Le_quad | ||
216 | .quad .Ld9,.Le_quad | ||
217 | .quad .Ls10,.Le_byte | ||
218 | .quad .Ld10,.Le_byte | ||
219 | #ifdef FIX_ALIGNMENT | ||
220 | .quad .Ls11,.Lzero_rest | ||
221 | .quad .Ld11,.Lzero_rest | ||
222 | #endif | ||
223 | .quad .Le5,.Le_zero | ||
224 | .previous | ||
225 | |||
226 | /* compute 64-offset for main loop. 8 bytes accuracy with error on the | ||
227 | pessimistic side. this is gross. it would be better to fix the | ||
228 | interface. */ | ||
229 | /* eax: zero, ebx: 64 */ | ||
230 | .Ls1e: addl $8,%eax | ||
231 | .Ls2e: addl $8,%eax | ||
232 | .Ls3e: addl $8,%eax | ||
233 | .Ls4e: addl $8,%eax | ||
234 | .Ls5e: addl $8,%eax | ||
235 | .Ls6e: addl $8,%eax | ||
236 | .Ls7e: addl $8,%eax | ||
237 | .Ls8e: addl $8,%eax | ||
238 | addq %rbx,%rdi /* +64 */ | ||
239 | subq %rax,%rdi /* correct destination with computed offset */ | ||
240 | |||
241 | shlq $6,%rdx /* loop counter * 64 (stride length) */ | ||
242 | addq %rax,%rdx /* add offset to loopcnt */ | ||
243 | andl $63,%ecx /* remaining bytes */ | ||
244 | addq %rcx,%rdx /* add them */ | ||
245 | jmp .Lzero_rest | ||
246 | |||
247 | /* exception on quad word loop in tail handling */ | ||
248 | /* ecx: loopcnt/8, %edx: length, rdi: correct */ | ||
249 | .Le_quad: | ||
250 | shll $3,%ecx | ||
251 | andl $7,%edx | ||
252 | addl %ecx,%edx | ||
253 | /* edx: bytes to zero, rdi: dest, eax:zero */ | ||
254 | .Lzero_rest: | ||
255 | movq %rdx,%rcx | ||
256 | .Le_byte: | ||
257 | xorl %eax,%eax | ||
258 | .Le5: rep | ||
259 | stosb | ||
260 | /* when there is another exception while zeroing the rest just return */ | ||
261 | .Le_zero: | ||
262 | movq %rdx,%rax | ||
263 | jmp .Lende | ||
264 | |||
265 | /* C stepping K8 CPUs run faster using the string copy instructions. | ||
266 | This is also a lot simpler. Use them when possible. | ||
267 | Patch in jmps to this code instead of copying it fully | ||
268 | to avoid unwanted aliasing in the exception tables. */ | ||
269 | |||
270 | /* rdi destination | ||
271 | * rsi source | ||
272 | * rdx count | ||
273 | * | ||
274 | * Output: | ||
275 | * eax uncopied bytes or 0 if successful. | ||
276 | */ | ||
277 | copy_user_generic_c: | ||
278 | movl %edx,%ecx | ||
279 | shrl $3,%ecx | ||
280 | andl $7,%edx | ||
281 | 1: rep | ||
282 | movsq | ||
283 | movl %edx,%ecx | ||
284 | 2: rep | ||
285 | movsb | ||
286 | 4: movl %ecx,%eax | ||
287 | ret | ||
288 | 3: lea (%rdx,%rcx,8),%rax | ||
289 | ret | ||
290 | |||
291 | .section __ex_table,"a" | ||
292 | .quad 1b,3b | ||
293 | .quad 2b,4b | ||
294 | .previous | ||
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 000000000000..01808ec37836
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
1 | /* | ||
2 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
3 | * | ||
4 | * This file is subject to the terms and conditions of the GNU General Public | ||
5 | * License. See the file COPYING in the main directory of this archive | ||
6 | * for more details. No warranty for anything given at all. | ||
7 | */ | ||
8 | #include <linux/linkage.h> | ||
9 | #include <asm/errno.h> | ||
10 | |||
11 | /* | ||
12 | * Checksum copy with exception handling. | ||
13 | * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the | ||
14 | * destination is zeroed. | ||
15 | * | ||
16 | * Input | ||
17 | * rdi source | ||
18 | * rsi destination | ||
19 | * edx len (32bit) | ||
20 | * ecx sum (32bit) | ||
21 | * r8 src_err_ptr (int) | ||
22 | * r9 dst_err_ptr (int) | ||
23 | * | ||
24 | * Output | ||
25 | * eax 32bit sum. undefined in case of exception. | ||
26 | * | ||
27 | * Wrappers need to take care of producing a valid sum on exceptions and of zeroing. | ||
28 | * They also should align source or destination to 8 bytes. | ||
29 | */ | ||
30 | |||
31 | .macro source | ||
32 | 10: | ||
33 | .section __ex_table,"a" | ||
34 | .align 8 | ||
35 | .quad 10b,.Lbad_source | ||
36 | .previous | ||
37 | .endm | ||
38 | |||
39 | .macro dest | ||
40 | 20: | ||
41 | .section __ex_table,"a" | ||
42 | .align 8 | ||
43 | .quad 20b,.Lbad_dest | ||
44 | .previous | ||
45 | .endm | ||
46 | |||
47 | .macro ignore L=.Lignore | ||
48 | 30: | ||
49 | .section __ex_table,"a" | ||
50 | .align 8 | ||
51 | .quad 30b,\L | ||
52 | .previous | ||
53 | .endm | ||
54 | |||
55 | |||
56 | .globl csum_partial_copy_generic | ||
57 | .p2align 4 | ||
58 | csum_partial_copy_generic: | ||
59 | cmpl $3*64,%edx | ||
60 | jle .Lignore | ||
61 | |||
62 | .Lignore: | ||
63 | subq $7*8,%rsp | ||
64 | movq %rbx,2*8(%rsp) | ||
65 | movq %r12,3*8(%rsp) | ||
66 | movq %r14,4*8(%rsp) | ||
67 | movq %r13,5*8(%rsp) | ||
68 | movq %rbp,6*8(%rsp) | ||
69 | |||
70 | movq %r8,(%rsp) | ||
71 | movq %r9,1*8(%rsp) | ||
72 | |||
73 | movl %ecx,%eax | ||
74 | movl %edx,%ecx | ||
75 | |||
76 | xorl %r9d,%r9d | ||
77 | movq %rcx,%r12 | ||
78 | |||
79 | shrq $6,%r12 | ||
80 | jz .Lhandle_tail /* < 64 */ | ||
81 | |||
82 | clc | ||
83 | |||
84 | /* main loop. clear in 64 byte blocks */ | ||
85 | /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ | ||
86 | /* r11: temp3, rdx: temp4, r12 loopcnt */ | ||
87 | /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ | ||
88 | .p2align 4 | ||
89 | .Lloop: | ||
90 | source | ||
91 | movq (%rdi),%rbx | ||
92 | source | ||
93 | movq 8(%rdi),%r8 | ||
94 | source | ||
95 | movq 16(%rdi),%r11 | ||
96 | source | ||
97 | movq 24(%rdi),%rdx | ||
98 | |||
99 | source | ||
100 | movq 32(%rdi),%r10 | ||
101 | source | ||
102 | movq 40(%rdi),%rbp | ||
103 | source | ||
104 | movq 48(%rdi),%r14 | ||
105 | source | ||
106 | movq 56(%rdi),%r13 | ||
107 | |||
108 | ignore 2f | ||
109 | prefetcht0 5*64(%rdi) | ||
110 | 2: | ||
111 | adcq %rbx,%rax | ||
112 | adcq %r8,%rax | ||
113 | adcq %r11,%rax | ||
114 | adcq %rdx,%rax | ||
115 | adcq %r10,%rax | ||
116 | adcq %rbp,%rax | ||
117 | adcq %r14,%rax | ||
118 | adcq %r13,%rax | ||
119 | |||
120 | decl %r12d | ||
121 | |||
122 | dest | ||
123 | movq %rbx,(%rsi) | ||
124 | dest | ||
125 | movq %r8,8(%rsi) | ||
126 | dest | ||
127 | movq %r11,16(%rsi) | ||
128 | dest | ||
129 | movq %rdx,24(%rsi) | ||
130 | |||
131 | dest | ||
132 | movq %r10,32(%rsi) | ||
133 | dest | ||
134 | movq %rbp,40(%rsi) | ||
135 | dest | ||
136 | movq %r14,48(%rsi) | ||
137 | dest | ||
138 | movq %r13,56(%rsi) | ||
139 | |||
140 | 3: | ||
141 | |||
142 | leaq 64(%rdi),%rdi | ||
143 | leaq 64(%rsi),%rsi | ||
144 | |||
145 | jnz .Lloop | ||
146 | |||
147 | adcq %r9,%rax | ||
148 | |||
149 | /* do last up to 56 bytes */ | ||
150 | .Lhandle_tail: | ||
151 | /* ecx: count */ | ||
152 | movl %ecx,%r10d | ||
153 | andl $63,%ecx | ||
154 | shrl $3,%ecx | ||
155 | jz .Lfold | ||
156 | clc | ||
157 | .p2align 4 | ||
158 | .Lloop_8: | ||
159 | source | ||
160 | movq (%rdi),%rbx | ||
161 | adcq %rbx,%rax | ||
162 | decl %ecx | ||
163 | dest | ||
164 | movq %rbx,(%rsi) | ||
165 | leaq 8(%rsi),%rsi /* preserve carry */ | ||
166 | leaq 8(%rdi),%rdi | ||
167 | jnz .Lloop_8 | ||
168 | adcq %r9,%rax /* add in carry */ | ||
169 | |||
170 | .Lfold: | ||
171 | /* reduce checksum to 32bits */ | ||
172 | movl %eax,%ebx | ||
173 | shrq $32,%rax | ||
174 | addl %ebx,%eax | ||
175 | adcl %r9d,%eax | ||
176 | |||
177 | /* do last up to 6 bytes */ | ||
178 | .Lhandle_7: | ||
179 | movl %r10d,%ecx | ||
180 | andl $7,%ecx | ||
181 | shrl $1,%ecx | ||
182 | jz .Lhandle_1 | ||
183 | movl $2,%edx | ||
184 | xorl %ebx,%ebx | ||
185 | clc | ||
186 | .p2align 4 | ||
187 | .Lloop_1: | ||
188 | source | ||
189 | movw (%rdi),%bx | ||
190 | adcl %ebx,%eax | ||
191 | dest | ||
192 | decl %ecx | ||
193 | movw %bx,(%rsi) | ||
194 | leaq 2(%rdi),%rdi | ||
195 | leaq 2(%rsi),%rsi | ||
196 | jnz .Lloop_1 | ||
197 | adcl %r9d,%eax /* add in carry */ | ||
198 | |||
199 | /* handle last odd byte */ | ||
200 | .Lhandle_1: | ||
201 | testl $1,%r10d | ||
202 | jz .Lende | ||
203 | xorl %ebx,%ebx | ||
204 | source | ||
205 | movb (%rdi),%bl | ||
206 | dest | ||
207 | movb %bl,(%rsi) | ||
208 | addl %ebx,%eax | ||
209 | adcl %r9d,%eax /* carry */ | ||
210 | |||
211 | .Lende: | ||
212 | movq 2*8(%rsp),%rbx | ||
213 | movq 3*8(%rsp),%r12 | ||
214 | movq 4*8(%rsp),%r14 | ||
215 | movq 5*8(%rsp),%r13 | ||
216 | movq 6*8(%rsp),%rbp | ||
217 | addq $7*8,%rsp | ||
218 | ret | ||
219 | |||
220 | /* Exception handlers. Very simple, zeroing is done in the wrappers */ | ||
221 | .Lbad_source: | ||
222 | movq (%rsp),%rax | ||
223 | testq %rax,%rax | ||
224 | jz .Lende | ||
225 | movl $-EFAULT,(%rax) | ||
226 | jmp .Lende | ||
227 | |||
228 | .Lbad_dest: | ||
229 | movq 8(%rsp),%rax | ||
230 | testq %rax,%rax | ||
231 | jz .Lende | ||
232 | movl $-EFAULT,(%rax) | ||
233 | jmp .Lende | ||
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 000000000000..5384e227cdf6
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
1 | /* | ||
2 | * arch/x86_64/lib/csum-partial.c | ||
3 | * | ||
4 | * This file contains network checksum routines that are better done | ||
5 | * in an architecture-specific manner due to speed. | ||
6 | */ | ||
7 | |||
8 | #include <linux/compiler.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <asm/checksum.h> | ||
11 | |||
12 | #define __force_inline inline __attribute__((always_inline)) | ||
13 | |||
14 | static inline unsigned short from32to16(unsigned a) | ||
15 | { | ||
16 | unsigned short b = a >> 16; | ||
17 | asm("addw %w2,%w0\n\t" | ||
18 | "adcw $0,%w0\n" | ||
19 | : "=r" (b) | ||
20 | : "0" (b), "r" (a)); | ||
21 | return b; | ||
22 | } | ||
23 | |||
24 | /* | ||
25 | * Do a 64-bit checksum on an arbitrary memory area. | ||
26 | * Returns a 32bit checksum. | ||
27 | * | ||
28 | * This isn't as time critical as it used to be because many NICs | ||
29 | * do hardware checksumming these days. | ||
30 | * | ||
31 | * Things tried and found to not make it faster: | ||
32 | * Manual Prefetching | ||
33 | * Unrolling to a 128 byte inner loop. | ||
34 | * Using interleaving with more registers to break the carry chains. | ||
35 | */ | ||
36 | static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len) | ||
37 | { | ||
38 | unsigned odd, count; | ||
39 | unsigned long result = 0; | ||
40 | |||
41 | if (unlikely(len == 0)) | ||
42 | return result; | ||
43 | odd = 1 & (unsigned long) buff; | ||
44 | if (unlikely(odd)) { | ||
45 | result = *buff << 8; | ||
46 | len--; | ||
47 | buff++; | ||
48 | } | ||
49 | count = len >> 1; /* nr of 16-bit words.. */ | ||
50 | if (count) { | ||
51 | if (2 & (unsigned long) buff) { | ||
52 | result += *(unsigned short *)buff; | ||
53 | count--; | ||
54 | len -= 2; | ||
55 | buff += 2; | ||
56 | } | ||
57 | count >>= 1; /* nr of 32-bit words.. */ | ||
58 | if (count) { | ||
59 | unsigned long zero; | ||
60 | unsigned count64; | ||
61 | if (4 & (unsigned long) buff) { | ||
62 | result += *(unsigned int *) buff; | ||
63 | count--; | ||
64 | len -= 4; | ||
65 | buff += 4; | ||
66 | } | ||
67 | count >>= 1; /* nr of 64-bit words.. */ | ||
68 | |||
69 | /* main loop using 64byte blocks */ | ||
70 | zero = 0; | ||
71 | count64 = count >> 3; | ||
72 | while (count64) { | ||
73 | asm("addq 0*8(%[src]),%[res]\n\t" | ||
74 | "adcq 1*8(%[src]),%[res]\n\t" | ||
75 | "adcq 2*8(%[src]),%[res]\n\t" | ||
76 | "adcq 3*8(%[src]),%[res]\n\t" | ||
77 | "adcq 4*8(%[src]),%[res]\n\t" | ||
78 | "adcq 5*8(%[src]),%[res]\n\t" | ||
79 | "adcq 6*8(%[src]),%[res]\n\t" | ||
80 | "adcq 7*8(%[src]),%[res]\n\t" | ||
81 | "adcq %[zero],%[res]" | ||
82 | : [res] "=r" (result) | ||
83 | : [src] "r" (buff), [zero] "r" (zero), | ||
84 | "[res]" (result)); | ||
85 | buff += 64; | ||
86 | count64--; | ||
87 | } | ||
88 | |||
89 | /* last up to 7 8-byte blocks */ | ||
90 | count %= 8; | ||
91 | while (count) { | ||
92 | asm("addq %1,%0\n\t" | ||
93 | "adcq %2,%0\n" | ||
94 | : "=r" (result) | ||
95 | : "m" (*(unsigned long *)buff), | ||
96 | "r" (zero), "0" (result)); | ||
97 | --count; | ||
98 | buff += 8; | ||
99 | } | ||
100 | result = add32_with_carry(result>>32, | ||
101 | result&0xffffffff); | ||
102 | |||
103 | if (len & 4) { | ||
104 | result += *(unsigned int *) buff; | ||
105 | buff += 4; | ||
106 | } | ||
107 | } | ||
108 | if (len & 2) { | ||
109 | result += *(unsigned short *) buff; | ||
110 | buff += 2; | ||
111 | } | ||
112 | } | ||
113 | if (len & 1) | ||
114 | result += *buff; | ||
115 | result = add32_with_carry(result>>32, result & 0xffffffff); | ||
116 | if (unlikely(odd)) { | ||
117 | result = from32to16(result); | ||
118 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
119 | } | ||
120 | return result; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * computes the checksum of a memory block at buff, length len, | ||
125 | * and adds in "sum" (32-bit) | ||
126 | * | ||
127 | * returns a 32-bit number suitable for feeding into itself | ||
128 | * or csum_tcpudp_magic | ||
129 | * | ||
130 | * this function must be called with even lengths, except | ||
131 | * for the last fragment, which may be odd | ||
132 | * | ||
133 | * it's best to have buff aligned on a 64-bit boundary | ||
134 | */ | ||
135 | unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum) | ||
136 | { | ||
137 | return add32_with_carry(do_csum(buff, len), sum); | ||
138 | } | ||
139 | |||
140 | EXPORT_SYMBOL(csum_partial); | ||
141 | |||
142 | /* | ||
143 | * this routine is used for miscellaneous IP-like checksums, mainly | ||
144 | * in icmp.c | ||
145 | */ | ||
146 | unsigned short ip_compute_csum(unsigned char * buff, int len) | ||
147 | { | ||
148 | return csum_fold(csum_partial(buff,len,0)); | ||
149 | } | ||
150 | |||
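The add32_with_carry()/from32to16() steps fold the wide accumulator back down with end-around carries; a portable sketch of the same reduction (fold_csum64 is an illustrative name; the final complement is what csum_fold() contributes):

    #include <stdint.h>

    /* Reduce a 64-bit partial sum to the final 16-bit ones'-complement checksum. */
    static uint16_t fold_csum64(uint64_t sum)
    {
            sum = (sum & 0xffffffffULL) + (sum >> 32);   /* 64 -> 33 bits */
            sum = (sum & 0xffffffffULL) + (sum >> 32);   /* 33 -> 32 bits */
            sum = (sum & 0xffff) + (sum >> 16);          /* 32 -> 17 bits */
            sum = (sum & 0xffff) + (sum >> 16);          /* 17 -> 16 bits */
            return (uint16_t)~sum;                       /* complement, as csum_fold() does */
    }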
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 000000000000..94323f20816e
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
1 | /* Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
2 | * Subject to the GNU Public License v.2 | ||
3 | * | ||
4 | * Wrappers of assembly checksum functions for x86-64. | ||
5 | */ | ||
6 | |||
7 | #include <asm/checksum.h> | ||
8 | #include <linux/module.h> | ||
9 | |||
10 | /** | ||
11 | * csum_partial_copy_from_user - Copy and checksum from user space. | ||
12 | * @src: source address (user space) | ||
13 | * @dst: destination address | ||
14 | * @len: number of bytes to be copied. | ||
15 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
16 | * @errp: set to -EFAULT for a bad source address. | ||
17 | * | ||
18 | * Returns a 32bit unfolded checksum of the buffer. | ||
19 | * src and dst are best aligned to 64bits. | ||
20 | */ | ||
21 | unsigned int | ||
22 | csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, | ||
23 | int len, unsigned int isum, int *errp) | ||
24 | { | ||
25 | might_sleep(); | ||
26 | *errp = 0; | ||
27 | if (likely(access_ok(VERIFY_READ,src, len))) { | ||
28 | /* Why 6, not 7? To handle odd aligned addresses we would | ||
29 | need considerable extra complication to fix up the | ||
30 | checksum, which is defined as a 16bit accumulator. The | ||
31 | alignment fixup code is primarily for performance | ||
32 | compatibility with 32bit and handles odd addresses | ||
33 | slowly anyway. */ | ||
34 | if (unlikely((unsigned long)src & 6)) { | ||
35 | while (((unsigned long)src & 6) && len >= 2) { | ||
36 | __u16 val16; | ||
37 | *errp = __get_user(val16, (__u16 __user *)src); | ||
38 | if (*errp) | ||
39 | return isum; | ||
40 | *(__u16 *)dst = val16; | ||
41 | isum = add32_with_carry(isum, val16); | ||
42 | src += 2; | ||
43 | dst += 2; | ||
44 | len -= 2; | ||
45 | } | ||
46 | } | ||
47 | isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL); | ||
48 | if (likely(*errp == 0)) | ||
49 | return isum; | ||
50 | } | ||
51 | *errp = -EFAULT; | ||
52 | memset(dst,0,len); | ||
53 | return isum; | ||
54 | } | ||
55 | |||
56 | EXPORT_SYMBOL(csum_partial_copy_from_user); | ||
57 | |||
58 | /** | ||
59 | * csum_partial_copy_to_user - Copy and checksum to user space. | ||
60 | * @src: source address | ||
61 | * @dst: destination address (user space) | ||
62 | * @len: number of bytes to be copied. | ||
63 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
64 | * @errp: set to -EFAULT for a bad destination address. | ||
65 | * | ||
66 | * Returns a 32bit unfolded checksum of the buffer. | ||
67 | * src and dst are best aligned to 64bits. | ||
68 | */ | ||
69 | unsigned int | ||
70 | csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst, | ||
71 | int len, unsigned int isum, int *errp) | ||
72 | { | ||
73 | might_sleep(); | ||
74 | if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { | ||
75 | *errp = -EFAULT; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | if (unlikely((unsigned long)dst & 6)) { | ||
80 | while (((unsigned long)dst & 6) && len >= 2) { | ||
81 | __u16 val16 = *(__u16 *)src; | ||
82 | isum = add32_with_carry(isum, val16); | ||
83 | *errp = __put_user(val16, (__u16 __user *)dst); | ||
84 | if (*errp) | ||
85 | return isum; | ||
86 | src += 2; | ||
87 | dst += 2; | ||
88 | len -= 2; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | *errp = 0; | ||
93 | return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); | ||
94 | } | ||
95 | |||
96 | EXPORT_SYMBOL(csum_partial_copy_to_user); | ||
97 | |||
98 | /** | ||
99 | * csum_partial_copy_nocheck - Copy and checksum. | ||
100 | * @src: source address | ||
101 | * @dst: destination address | ||
102 | * @len: number of bytes to be copied. | ||
103 | * @isum: initial sum that is added into the result (32bit unfolded) | ||
104 | * | ||
105 | * Returns a 32bit unfolded checksum of the buffer. | ||
106 | */ | ||
107 | unsigned int | ||
108 | csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum) | ||
109 | { | ||
110 | return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); | ||
111 | } | ||
112 | |||
113 | unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr, | ||
114 | __u32 len, unsigned short proto, unsigned int sum) | ||
115 | { | ||
116 | __u64 rest, sum64; | ||
117 | |||
118 | rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum; | ||
119 | asm(" addq (%[saddr]),%[sum]\n" | ||
120 | " adcq 8(%[saddr]),%[sum]\n" | ||
121 | " adcq (%[daddr]),%[sum]\n" | ||
122 | " adcq 8(%[daddr]),%[sum]\n" | ||
123 | " adcq $0,%[sum]\n" | ||
124 | : [sum] "=r" (sum64) | ||
125 | : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); | ||
126 | return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32)); | ||
127 | } | ||
128 | |||
129 | EXPORT_SYMBOL(csum_ipv6_magic); | ||
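The adcq chain in csum_ipv6_magic() just accumulates the IPv6 pseudo-header; an equivalent computation in portable C, for reference (the names and the 32-bit-word view are illustrative; the asm adds 64-bit words instead, but the folded result is the same):

    #include <stdint.h>

    /* Sum the IPv6 pseudo-header: 16-byte saddr, 16-byte daddr, length,
       protocol and the incoming sum, all already in network byte order. */
    static uint32_t ipv6_pseudo_sum_sketch(const uint32_t saddr[4],
                                           const uint32_t daddr[4],
                                           uint32_t len_be, uint32_t proto_be,
                                           uint32_t sum)
    {
            uint64_t acc = (uint64_t)len_be + proto_be + sum;
            int i;

            for (i = 0; i < 4; i++)
                    acc += (uint64_t)saddr[i] + daddr[i];
            acc = (acc & 0xffffffffULL) + (acc >> 32);   /* end-around carry */
            acc = (acc & 0xffffffffULL) + (acc >> 32);
            return (uint32_t)acc;             /* still unfolded; csum_fold() finishes it */
    }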
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/x86_64/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
1 | /* | ||
2 | * x86 version of "atomic_dec_and_lock()" using | ||
3 | * the atomic "cmpxchg" instruction. | ||
4 | * | ||
5 | * (For CPUs lacking cmpxchg, we use the slow | ||
6 | * generic version, and this one never even gets | ||
7 | * compiled). | ||
8 | */ | ||
9 | |||
10 | #include <linux/spinlock.h> | ||
11 | #include <asm/atomic.h> | ||
12 | |||
13 | int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) | ||
14 | { | ||
15 | int counter; | ||
16 | int newcount; | ||
17 | |||
18 | repeat: | ||
19 | counter = atomic_read(atomic); | ||
20 | newcount = counter-1; | ||
21 | |||
22 | if (!newcount) | ||
23 | goto slow_path; | ||
24 | |||
25 | asm volatile("lock; cmpxchgl %1,%2" | ||
26 | :"=a" (newcount) | ||
27 | :"r" (newcount), "m" (atomic->counter), "0" (counter)); | ||
28 | |||
29 | /* If the above failed, "eax" will have changed */ | ||
30 | if (newcount != counter) | ||
31 | goto repeat; | ||
32 | return 0; | ||
33 | |||
34 | slow_path: | ||
35 | spin_lock(lock); | ||
36 | if (atomic_dec_and_test(atomic)) | ||
37 | return 1; | ||
38 | spin_unlock(lock); | ||
39 | return 0; | ||
40 | } | ||
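A note on usage: atomic_dec_and_lock() exists so that dropping the last reference and unlinking the object can be done atomically with respect to lookups, while the common non-final decrement never touches the lock. A sketch of the typical caller (the object type, lock and list names are illustrative):

    #include <linux/spinlock.h>
    #include <linux/list.h>
    #include <linux/slab.h>
    #include <asm/atomic.h>

    struct my_obj {
            atomic_t         refcount;
            struct list_head node;
    };

    static spinlock_t table_lock = SPIN_LOCK_UNLOCKED;

    static void put_object(struct my_obj *obj)
    {
            /* Returns 1, with table_lock held, only for the final reference. */
            if (atomic_dec_and_lock(&obj->refcount, &table_lock)) {
                    list_del(&obj->node);            /* unlink while still locked */
                    spin_unlock(&table_lock);
                    kfree(obj);
            }
    }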
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
new file mode 100644
index 000000000000..6e2d66472eb1
--- /dev/null
+++ b/arch/x86_64/lib/delay.c
@@ -0,0 +1,48 @@
1 | /* | ||
2 | * Precise Delay Loops for x86-64 | ||
3 | * | ||
4 | * Copyright (C) 1993 Linus Torvalds | ||
5 | * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> | ||
6 | * | ||
7 | * The __delay function must _NOT_ be inlined as its execution time | ||
8 | * depends wildly on alignment on many x86 processors. | ||
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <asm/delay.h> | ||
15 | |||
16 | #ifdef CONFIG_SMP | ||
17 | #include <asm/smp.h> | ||
18 | #endif | ||
19 | |||
20 | int x86_udelay_tsc = 0; /* Delay via TSC */ | ||
21 | |||
22 | void __delay(unsigned long loops) | ||
23 | { | ||
24 | unsigned bclock, now; | ||
25 | |||
26 | rdtscl(bclock); | ||
27 | do | ||
28 | { | ||
29 | rep_nop(); | ||
30 | rdtscl(now); | ||
31 | } | ||
32 | while((now-bclock) < loops); | ||
33 | } | ||
34 | |||
35 | inline void __const_udelay(unsigned long xloops) | ||
36 | { | ||
37 | __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); | ||
38 | } | ||
39 | |||
40 | void __udelay(unsigned long usecs) | ||
41 | { | ||
42 | __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ | ||
43 | } | ||
44 | |||
45 | void __ndelay(unsigned long nsecs) | ||
46 | { | ||
47 | __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ | ||
48 | } | ||
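The constant 0x000010c6 is floor(2^32 / 10^6), so the >> 32 in __const_udelay() cancels the scaling and __udelay(usecs) ends up spinning for roughly usecs * loops_per_jiffy * HZ / 10^6 calibrated loops. A quick user-space check of that arithmetic (the loops_per_jiffy and HZ values are only examples):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long usecs = 1000, lpj = 2000000, hz = 1000;
            unsigned long long xloops = usecs * 0x10c6;              /* __udelay() scaling   */
            unsigned long long loops  = ((xloops * lpj) >> 32) * hz; /* __const_udelay()     */

            printf("0x%llx\n", 0x100000000ULL / 1000000);            /* prints 0x10c6 */
            printf("%llu loops (exact: %llu)\n",
                   loops, usecs * lpj * hz / 1000000);               /* roughly 2000000 */
            return 0;
    }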
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
new file mode 100644
index 000000000000..f94ea8a44051
--- /dev/null
+++ b/arch/x86_64/lib/getuser.S
@@ -0,0 +1,101 @@
1 | /* | ||
2 | * __get_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __get_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address. | ||
17 | * The register is modified, but all changes are undone | ||
18 | * before returning because the C code doesn't know about it. | ||
19 | * | ||
20 | * Outputs: %rax is error code (0 or -EFAULT) | ||
21 | * %rdx contains zero-extended value | ||
22 | * | ||
23 | * %r8 is destroyed. | ||
24 | * | ||
25 | * These functions should not modify any other registers, | ||
26 | * as they get called from within inline assembly. | ||
27 | */ | ||
28 | |||
29 | #include <linux/linkage.h> | ||
30 | #include <asm/page.h> | ||
31 | #include <asm/errno.h> | ||
32 | #include <asm/offset.h> | ||
33 | #include <asm/thread_info.h> | ||
34 | |||
35 | .text | ||
36 | .p2align 4 | ||
37 | .globl __get_user_1 | ||
38 | __get_user_1: | ||
39 | GET_THREAD_INFO(%r8) | ||
40 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
41 | jae bad_get_user | ||
42 | 1: movzb (%rcx),%edx | ||
43 | xorl %eax,%eax | ||
44 | ret | ||
45 | |||
46 | .p2align 4 | ||
47 | .globl __get_user_2 | ||
48 | __get_user_2: | ||
49 | GET_THREAD_INFO(%r8) | ||
50 | addq $1,%rcx | ||
51 | jc 20f | ||
52 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
53 | jae 20f | ||
54 | decq %rcx | ||
55 | 2: movzwl (%rcx),%edx | ||
56 | xorl %eax,%eax | ||
57 | ret | ||
58 | 20: decq %rcx | ||
59 | jmp bad_get_user | ||
60 | |||
61 | .p2align 4 | ||
62 | .globl __get_user_4 | ||
63 | __get_user_4: | ||
64 | GET_THREAD_INFO(%r8) | ||
65 | addq $3,%rcx | ||
66 | jc 30f | ||
67 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
68 | jae 30f | ||
69 | subq $3,%rcx | ||
70 | 3: movl (%rcx),%edx | ||
71 | xorl %eax,%eax | ||
72 | ret | ||
73 | 30: subq $3,%rcx | ||
74 | jmp bad_get_user | ||
75 | |||
76 | .p2align 4 | ||
77 | .globl __get_user_8 | ||
78 | __get_user_8: | ||
79 | GET_THREAD_INFO(%r8) | ||
80 | addq $7,%rcx | ||
81 | jc bad_get_user | ||
82 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
83 | jae bad_get_user | ||
84 | subq $7,%rcx | ||
85 | 4: movq (%rcx),%rdx | ||
86 | xorl %eax,%eax | ||
87 | ret | ||
88 | 40: subq $7,%rcx | ||
89 | jmp bad_get_user | ||
90 | |||
91 | bad_get_user: | ||
92 | xorl %edx,%edx | ||
93 | movq $(-EFAULT),%rax | ||
94 | ret | ||
95 | |||
96 | .section __ex_table,"a" | ||
97 | .quad 1b,bad_get_user | ||
98 | .quad 2b,bad_get_user | ||
99 | .quad 3b,bad_get_user | ||
100 | .quad 4b,bad_get_user | ||
101 | .previous | ||
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86_64/lib/io.c
@@ -0,0 +1,23 @@
1 | #include <linux/string.h> | ||
2 | #include <asm/io.h> | ||
3 | #include <linux/module.h> | ||
4 | |||
5 | void __memcpy_toio(unsigned long dst,const void*src,unsigned len) | ||
6 | { | ||
7 | __inline_memcpy((void *) dst,src,len); | ||
8 | } | ||
9 | EXPORT_SYMBOL(__memcpy_toio); | ||
10 | |||
11 | void __memcpy_fromio(void *dst,unsigned long src,unsigned len) | ||
12 | { | ||
13 | __inline_memcpy(dst,(const void *) src,len); | ||
14 | } | ||
15 | EXPORT_SYMBOL(__memcpy_fromio); | ||
16 | |||
17 | void memset_io(volatile void __iomem *a, int b, size_t c) | ||
18 | { | ||
19 | /* XXX: memset can mangle the IO patterns quite a bit. | ||
20 | perhaps it would be better to use a dumb one */ | ||
21 | memset((void *)a,b,c); | ||
22 | } | ||
23 | EXPORT_SYMBOL(memset_io); | ||
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 000000000000..c6c46494fef5
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
1 | /* Copyright 2002 Andi Kleen */ | ||
2 | |||
3 | #include <asm/cpufeature.h> | ||
4 | /* | ||
5 | * memcpy - Copy a memory block. | ||
6 | * | ||
7 | * Input: | ||
8 | * rdi destination | ||
9 | * rsi source | ||
10 | * rdx count | ||
11 | * | ||
12 | * Output: | ||
13 | * rax original destination | ||
14 | */ | ||
15 | |||
16 | .globl __memcpy | ||
17 | .globl memcpy | ||
18 | .p2align 4 | ||
19 | __memcpy: | ||
20 | memcpy: | ||
21 | pushq %rbx | ||
22 | movq %rdi,%rax | ||
23 | |||
24 | movl %edx,%ecx | ||
25 | shrl $6,%ecx | ||
26 | jz .Lhandle_tail | ||
27 | |||
28 | .p2align 4 | ||
29 | .Lloop_64: | ||
30 | decl %ecx | ||
31 | |||
32 | movq (%rsi),%r11 | ||
33 | movq 8(%rsi),%r8 | ||
34 | |||
35 | movq %r11,(%rdi) | ||
36 | movq %r8,1*8(%rdi) | ||
37 | |||
38 | movq 2*8(%rsi),%r9 | ||
39 | movq 3*8(%rsi),%r10 | ||
40 | |||
41 | movq %r9,2*8(%rdi) | ||
42 | movq %r10,3*8(%rdi) | ||
43 | |||
44 | movq 4*8(%rsi),%r11 | ||
45 | movq 5*8(%rsi),%r8 | ||
46 | |||
47 | movq %r11,4*8(%rdi) | ||
48 | movq %r8,5*8(%rdi) | ||
49 | |||
50 | movq 6*8(%rsi),%r9 | ||
51 | movq 7*8(%rsi),%r10 | ||
52 | |||
53 | movq %r9,6*8(%rdi) | ||
54 | movq %r10,7*8(%rdi) | ||
55 | |||
56 | leaq 64(%rsi),%rsi | ||
57 | leaq 64(%rdi),%rdi | ||
58 | jnz .Lloop_64 | ||
59 | |||
60 | .Lhandle_tail: | ||
61 | movl %edx,%ecx | ||
62 | andl $63,%ecx | ||
63 | shrl $3,%ecx | ||
64 | jz .Lhandle_7 | ||
65 | .p2align 4 | ||
66 | .Lloop_8: | ||
67 | decl %ecx | ||
68 | movq (%rsi),%r8 | ||
69 | movq %r8,(%rdi) | ||
70 | leaq 8(%rdi),%rdi | ||
71 | leaq 8(%rsi),%rsi | ||
72 | jnz .Lloop_8 | ||
73 | |||
74 | .Lhandle_7: | ||
75 | movl %edx,%ecx | ||
76 | andl $7,%ecx | ||
77 | jz .Lende | ||
78 | .p2align 4 | ||
79 | .Lloop_1: | ||
80 | movb (%rsi),%r8b | ||
81 | movb %r8b,(%rdi) | ||
82 | incq %rdi | ||
83 | incq %rsi | ||
84 | decl %ecx | ||
85 | jnz .Lloop_1 | ||
86 | |||
87 | .Lende: | ||
88 | popq %rbx | ||
89 | ret | ||
90 | .Lfinal: | ||
91 | |||
92 | /* C stepping K8 CPUs run faster using the string copy instructions. | ||
93 | It is also a lot simpler. Use this when possible */ | ||
94 | |||
95 | .section .altinstructions,"a" | ||
96 | .align 8 | ||
97 | .quad memcpy | ||
98 | .quad memcpy_c | ||
99 | .byte X86_FEATURE_K8_C | ||
100 | .byte .Lfinal-memcpy | ||
101 | .byte memcpy_c_end-memcpy_c | ||
102 | .previous | ||
103 | |||
104 | .section .altinstr_replacement,"ax" | ||
105 | /* rdi destination | ||
106 | * rsi source | ||
107 | * rdx count | ||
108 | */ | ||
109 | memcpy_c: | ||
110 | movq %rdi,%rax | ||
111 | movl %edx,%ecx | ||
112 | shrl $3,%ecx | ||
113 | andl $7,%edx | ||
114 | rep | ||
115 | movsq | ||
116 | movl %edx,%ecx | ||
117 | rep | ||
118 | movsb | ||
119 | ret | ||
120 | memcpy_c_end: | ||
121 | .previous | ||
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
new file mode 100644
index 000000000000..e93d5255fdc9
--- /dev/null
+++ b/arch/x86_64/lib/memmove.c
@@ -0,0 +1,19 @@
1 | /* Normally compiler builtins are used, but sometimes the compiler emits | ||
2 | calls to out-of-line code. Based on asm-i386/string.h. | ||
3 | */ | ||
4 | #define _STRING_C | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | #undef memmove | ||
8 | void *memmove(void * dest,const void *src,size_t count) | ||
9 | { | ||
10 | if (dest < src) { | ||
11 | __inline_memcpy(dest,src,count); | ||
12 | } else { | ||
13 | char *p = (char *) dest + count; | ||
14 | char *s = (char *) src + count; | ||
15 | while (count--) | ||
16 | *--p = *--s; | ||
17 | } | ||
18 | return dest; | ||
19 | } | ||
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 000000000000..4b4c40638640
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
1 | /* Copyright 2002 Andi Kleen, SuSE Labs */ | ||
2 | /* | ||
3 | * ISO C memset - set a memory block to a byte value. | ||
4 | * | ||
5 | * rdi destination | ||
6 | * rsi value (char) | ||
7 | * rdx count (bytes) | ||
8 | * | ||
9 | * rax original destination | ||
10 | */ | ||
11 | .globl __memset | ||
12 | .globl memset | ||
13 | .p2align 4 | ||
14 | memset: | ||
15 | __memset: | ||
16 | movq %rdi,%r10 | ||
17 | movq %rdx,%r11 | ||
18 | |||
19 | /* expand byte value */ | ||
20 | movzbl %sil,%ecx | ||
21 | movabs $0x0101010101010101,%rax | ||
22 | mul %rcx /* with rax, clobbers rdx */ | ||
23 | |||
24 | /* align dst */ | ||
25 | movl %edi,%r9d | ||
26 | andl $7,%r9d | ||
27 | jnz .Lbad_alignment | ||
28 | .Lafter_bad_alignment: | ||
29 | |||
30 | movl %r11d,%ecx | ||
31 | shrl $6,%ecx | ||
32 | jz .Lhandle_tail | ||
33 | |||
34 | .p2align 4 | ||
35 | .Lloop_64: | ||
36 | decl %ecx | ||
37 | movq %rax,(%rdi) | ||
38 | movq %rax,8(%rdi) | ||
39 | movq %rax,16(%rdi) | ||
40 | movq %rax,24(%rdi) | ||
41 | movq %rax,32(%rdi) | ||
42 | movq %rax,40(%rdi) | ||
43 | movq %rax,48(%rdi) | ||
44 | movq %rax,56(%rdi) | ||
45 | leaq 64(%rdi),%rdi | ||
46 | jnz .Lloop_64 | ||
47 | |||
48 | /* Handle tail in loops. The loops should be faster than hard | ||
49 | to predict jump tables. */ | ||
50 | .p2align 4 | ||
51 | .Lhandle_tail: | ||
52 | movl %r11d,%ecx | ||
53 | andl $63&(~7),%ecx | ||
54 | jz .Lhandle_7 | ||
55 | shrl $3,%ecx | ||
56 | .p2align 4 | ||
57 | .Lloop_8: | ||
58 | decl %ecx | ||
59 | movq %rax,(%rdi) | ||
60 | leaq 8(%rdi),%rdi | ||
61 | jnz .Lloop_8 | ||
62 | |||
63 | .Lhandle_7: | ||
64 | movl %r11d,%ecx | ||
65 | andl $7,%ecx | ||
66 | jz .Lende | ||
67 | .p2align 4 | ||
68 | .Lloop_1: | ||
69 | decl %ecx | ||
70 | movb %al,(%rdi) | ||
71 | leaq 1(%rdi),%rdi | ||
72 | jnz .Lloop_1 | ||
73 | |||
74 | .Lende: | ||
75 | movq %r10,%rax | ||
76 | ret | ||
77 | |||
78 | .Lbad_alignment: | ||
79 | cmpq $7,%r11 | ||
80 | jbe .Lhandle_7 | ||
81 | movq %rax,(%rdi) /* unaligned store */ | ||
82 | movq $8,%r8 | ||
83 | subq %r9,%r8 | ||
84 | addq %r8,%rdi | ||
85 | subq %r8,%r11 | ||
86 | jmp .Lafter_bad_alignment | ||
87 | |||
88 | /* C stepping K8 CPUs run faster using the string instructions. | ||
89 | It is also a lot simpler. Use this when possible */ | ||
90 | |||
91 | #include <asm/cpufeature.h> | ||
92 | |||
93 | .section .altinstructions,"a" | ||
94 | .align 8 | ||
95 | .quad memset | ||
96 | .quad memset_c | ||
97 | .byte X86_FEATURE_K8_C | ||
98 | .byte memset_c_end-memset_c | ||
99 | .byte memset_c_end-memset_c | ||
100 | .previous | ||
101 | |||
102 | .section .altinstr_replacement,"ax" | ||
103 | /* rdi destination | ||
104 | * rsi value | ||
105 | * rdx count | ||
106 | */ | ||
107 | memset_c: | ||
108 | movq %rdi,%r9 | ||
109 | movl %edx,%r8d | ||
110 | andl $7,%r8d | ||
111 | movl %edx,%ecx | ||
112 | shrl $3,%ecx | ||
113 | /* expand byte value */ | ||
114 | movzbl %sil,%esi | ||
115 | movabs $0x0101010101010101,%rax | ||
116 | mulq %rsi /* with rax, clobbers rdx */ | ||
117 | rep | ||
118 | stosq | ||
119 | movl %r8d,%ecx | ||
120 | rep | ||
121 | stosb | ||
122 | movq %r9,%rax | ||
123 | ret | ||
124 | memset_c_end: | ||
125 | .previous | ||
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
new file mode 100644
index 000000000000..0dee1fdcb162
--- /dev/null
+++ b/arch/x86_64/lib/putuser.S
@@ -0,0 +1,89 @@
1 | /* | ||
2 | * __put_user functions. | ||
3 | * | ||
4 | * (C) Copyright 1998 Linus Torvalds | ||
5 | * (C) Copyright 2005 Andi Kleen | ||
6 | * | ||
7 | * These functions have a non-standard call interface | ||
8 | * to make them more efficient, especially as they | ||
9 | * return an error value in addition to the "real" | ||
10 | * return value. | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * __put_user_X | ||
15 | * | ||
16 | * Inputs: %rcx contains the address | ||
17 | * %rdx contains new value | ||
18 | * | ||
19 | * Outputs: %rax is error code (0 or -EFAULT) | ||
20 | * | ||
21 | * %r8 is destroyed. | ||
22 | * | ||
23 | * These functions should not modify any other registers, | ||
24 | * as they get called from within inline assembly. | ||
25 | */ | ||
26 | |||
27 | #include <linux/linkage.h> | ||
28 | #include <asm/page.h> | ||
29 | #include <asm/errno.h> | ||
30 | #include <asm/offset.h> | ||
31 | #include <asm/thread_info.h> | ||
32 | |||
33 | .text | ||
34 | .p2align 4 | ||
35 | .globl __put_user_1 | ||
36 | __put_user_1: | ||
37 | GET_THREAD_INFO(%r8) | ||
38 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
39 | jae bad_put_user | ||
40 | 1: movb %dl,(%rcx) | ||
41 | xorl %eax,%eax | ||
42 | ret | ||
43 | |||
44 | .p2align 4 | ||
45 | .globl __put_user_2 | ||
46 | __put_user_2: | ||
47 | GET_THREAD_INFO(%r8) | ||
48 | addq $1,%rcx | ||
49 | jc bad_put_user | ||
50 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
51 | jae bad_put_user | ||
52 | 2: movw %dx,-1(%rcx) | ||
53 | xorl %eax,%eax | ||
54 | ret | ||
55 | |||
56 | .p2align 4 | ||
57 | .globl __put_user_4 | ||
58 | __put_user_4: | ||
59 | GET_THREAD_INFO(%r8) | ||
60 | addq $3,%rcx | ||
61 | jc bad_put_user | ||
62 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
63 | jae bad_put_user | ||
64 | 3: movl %edx,-3(%rcx) | ||
65 | xorl %eax,%eax | ||
66 | ret | ||
67 | |||
68 | .p2align 4 | ||
69 | .globl __put_user_8 | ||
70 | __put_user_8: | ||
71 | GET_THREAD_INFO(%r8) | ||
72 | addq $7,%rcx | ||
73 | jc bad_put_user | ||
74 | cmpq threadinfo_addr_limit(%r8),%rcx | ||
75 | jae bad_put_user | ||
76 | 4: movq %rdx,-7(%rcx) | ||
77 | xorl %eax,%eax | ||
78 | ret | ||
79 | |||
80 | bad_put_user: | ||
81 | movq $(-EFAULT),%rax | ||
82 | ret | ||
83 | |||
84 | .section __ex_table,"a" | ||
85 | .quad 1b,bad_put_user | ||
86 | .quad 2b,bad_put_user | ||
87 | .quad 3b,bad_put_user | ||
88 | .quad 4b,bad_put_user | ||
89 | .previous | ||
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
new file mode 100644
index 000000000000..acc1e2ca7ed7
--- /dev/null
+++ b/arch/x86_64/lib/thunk.S
@@ -0,0 +1,95 @@
1 | /* | ||
2 | * Save registers before calling assembly functions. This avoids | ||
3 | * disturbance of register allocation in some inline assembly constructs. | ||
4 | * Copyright 2001,2002 by Andi Kleen, SuSE Labs. | ||
5 | * Subject to the GNU public license, v.2. No warranty of any kind. | ||
6 | * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | #include <linux/linkage.h> | ||
11 | #include <asm/dwarf2.h> | ||
12 | #include <asm/calling.h> | ||
13 | #include <asm/rwlock.h> | ||
14 | |||
15 | /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ | ||
16 | .macro thunk name,func | ||
17 | .globl \name | ||
18 | \name: | ||
19 | CFI_STARTPROC | ||
20 | SAVE_ARGS | ||
21 | call \func | ||
22 | jmp restore | ||
23 | CFI_ENDPROC | ||
24 | .endm | ||
25 | |||
26 | /* rdi: arg1 ... normal C conventions. rax is passed from C. */ | ||
27 | .macro thunk_retrax name,func | ||
28 | .globl \name | ||
29 | \name: | ||
30 | CFI_STARTPROC | ||
31 | SAVE_ARGS | ||
32 | call \func | ||
33 | jmp restore_norax | ||
34 | CFI_ENDPROC | ||
35 | .endm | ||
36 | |||
37 | |||
38 | .section .sched.text | ||
39 | #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM | ||
40 | thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed | ||
41 | thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed | ||
42 | thunk rwsem_wake_thunk,rwsem_wake | ||
43 | thunk rwsem_downgrade_thunk,rwsem_downgrade_wake | ||
44 | #endif | ||
45 | thunk do_softirq_thunk,do_softirq | ||
46 | |||
47 | thunk __down_failed,__down | ||
48 | thunk_retrax __down_failed_interruptible,__down_interruptible | ||
49 | thunk_retrax __down_failed_trylock,__down_trylock | ||
50 | thunk __up_wakeup,__up | ||
51 | |||
52 | /* SAVE_ARGS below is used only for the .cfi directives it contains. */ | ||
53 | CFI_STARTPROC | ||
54 | SAVE_ARGS | ||
55 | restore: | ||
56 | RESTORE_ARGS | ||
57 | ret | ||
58 | CFI_ENDPROC | ||
59 | |||
60 | CFI_STARTPROC | ||
61 | SAVE_ARGS | ||
62 | restore_norax: | ||
63 | RESTORE_ARGS 1 | ||
64 | ret | ||
65 | CFI_ENDPROC | ||
66 | |||
67 | #ifdef CONFIG_SMP | ||
68 | /* Support for read/write spinlocks. */ | ||
69 | .text | ||
70 | /* rax: pointer to rwlock_t */ | ||
71 | ENTRY(__write_lock_failed) | ||
72 | lock | ||
73 | addl $RW_LOCK_BIAS,(%rax) | ||
74 | 1: rep | ||
75 | nop | ||
76 | cmpl $RW_LOCK_BIAS,(%rax) | ||
77 | jne 1b | ||
78 | lock | ||
79 | subl $RW_LOCK_BIAS,(%rax) | ||
80 | jnz __write_lock_failed | ||
81 | ret | ||
82 | |||
83 | /* rax: pointer to rwlock_t */ | ||
84 | ENTRY(__read_lock_failed) | ||
85 | lock | ||
86 | incl (%rax) | ||
87 | 1: rep | ||
88 | nop | ||
89 | cmpl $1,(%rax) | ||
90 | js 1b | ||
91 | lock | ||
92 | decl (%rax) | ||
93 | js __read_lock_failed | ||
94 | ret | ||
95 | #endif | ||
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
new file mode 100644
index 000000000000..db8abba1ad81
--- /dev/null
+++ b/arch/x86_64/lib/usercopy.c
@@ -0,0 +1,153 @@
1 | /* | ||
2 | * User address space access functions. | ||
3 | * | ||
4 | * Copyright 1997 Andi Kleen <ak@muc.de> | ||
5 | * Copyright 1997 Linus Torvalds | ||
6 | * Copyright 2002 Andi Kleen <ak@suse.de> | ||
7 | */ | ||
8 | #include <asm/uaccess.h> | ||
9 | |||
10 | /* | ||
11 | * Copy a null terminated string from userspace. | ||
12 | */ | ||
13 | |||
14 | #define __do_strncpy_from_user(dst,src,count,res) \ | ||
15 | do { \ | ||
16 | long __d0, __d1, __d2; \ | ||
17 | might_sleep(); \ | ||
18 | __asm__ __volatile__( \ | ||
19 | " testq %1,%1\n" \ | ||
20 | " jz 2f\n" \ | ||
21 | "0: lodsb\n" \ | ||
22 | " stosb\n" \ | ||
23 | " testb %%al,%%al\n" \ | ||
24 | " jz 1f\n" \ | ||
25 | " decq %1\n" \ | ||
26 | " jnz 0b\n" \ | ||
27 | "1: subq %1,%0\n" \ | ||
28 | "2:\n" \ | ||
29 | ".section .fixup,\"ax\"\n" \ | ||
30 | "3: movq %5,%0\n" \ | ||
31 | " jmp 2b\n" \ | ||
32 | ".previous\n" \ | ||
33 | ".section __ex_table,\"a\"\n" \ | ||
34 | " .align 8\n" \ | ||
35 | " .quad 0b,3b\n" \ | ||
36 | ".previous" \ | ||
37 | : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ | ||
38 | "=&D" (__d2) \ | ||
39 | : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ | ||
40 | : "memory"); \ | ||
41 | } while (0) | ||
42 | |||
43 | long | ||
44 | __strncpy_from_user(char *dst, const char __user *src, long count) | ||
45 | { | ||
46 | long res; | ||
47 | __do_strncpy_from_user(dst, src, count, res); | ||
48 | return res; | ||
49 | } | ||
50 | |||
51 | long | ||
52 | strncpy_from_user(char *dst, const char __user *src, long count) | ||
53 | { | ||
54 | long res = -EFAULT; | ||
55 | if (access_ok(VERIFY_READ, src, 1)) | ||
56 | __do_strncpy_from_user(dst, src, count, res); | ||
57 | return res; | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * Zero Userspace | ||
62 | */ | ||
63 | |||
64 | unsigned long __clear_user(void __user *addr, unsigned long size) | ||
65 | { | ||
66 | long __d0; | ||
67 | might_sleep(); | ||
68 | /* no memory constraint because it doesn't change any memory gcc knows | ||
69 | about */ | ||
70 | asm volatile( | ||
71 | " testq %[size8],%[size8]\n" | ||
72 | " jz 4f\n" | ||
73 | "0: movq %[zero],(%[dst])\n" | ||
74 | " addq %[eight],%[dst]\n" | ||
75 | " decl %%ecx ; jnz 0b\n" | ||
76 | "4: movq %[size1],%%rcx\n" | ||
77 | " testl %%ecx,%%ecx\n" | ||
78 | " jz 2f\n" | ||
79 | "1: movb %b[zero],(%[dst])\n" | ||
80 | " incq %[dst]\n" | ||
81 | " decl %%ecx ; jnz 1b\n" | ||
82 | "2:\n" | ||
83 | ".section .fixup,\"ax\"\n" | ||
84 | "3: lea 0(%[size1],%[size8],8),%[size8]\n" | ||
85 | " jmp 2b\n" | ||
86 | ".previous\n" | ||
87 | ".section __ex_table,\"a\"\n" | ||
88 | " .align 8\n" | ||
89 | " .quad 0b,3b\n" | ||
90 | " .quad 1b,2b\n" | ||
91 | ".previous" | ||
92 | : [size8] "=c"(size), [dst] "=&D" (__d0) | ||
93 | : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), | ||
94 | [zero] "r" (0UL), [eight] "r" (8UL)); | ||
95 | return size; | ||
96 | } | ||
97 | |||
98 | |||
99 | unsigned long clear_user(void __user *to, unsigned long n) | ||
100 | { | ||
101 | if (access_ok(VERIFY_WRITE, to, n)) | ||
102 | return __clear_user(to, n); | ||
103 | return n; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Return the size of a string (including the ending 0) | ||
108 | * | ||
109 | * Return 0 on exception, a value greater than N if too long | ||
110 | */ | ||
111 | |||
112 | long strnlen_user(const char __user *s, long n) | ||
113 | { | ||
114 | long res = 0; | ||
115 | char c; | ||
116 | |||
117 | if (!access_ok(VERIFY_READ, s, n)) | ||
118 | return 0; | ||
119 | |||
120 | while (1) { | ||
121 | if (res>n) | ||
122 | return n+1; | ||
123 | if (__get_user(c, s)) | ||
124 | return 0; | ||
125 | if (!c) | ||
126 | return res+1; | ||
127 | res++; | ||
128 | s++; | ||
129 | } | ||
130 | } | ||
131 | |||
132 | long strlen_user(const char __user *s) | ||
133 | { | ||
134 | long res = 0; | ||
135 | char c; | ||
136 | |||
137 | for (;;) { | ||
138 | if (get_user(c, s)) | ||
139 | return 0; | ||
140 | if (!c) | ||
141 | return res+1; | ||
142 | res++; | ||
143 | s++; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) | ||
148 | { | ||
149 | if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { | ||
150 | return copy_user_generic((__force void *)to, (__force void *)from, len); | ||
151 | } | ||
152 | return len; | ||
153 | } | ||
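The return conventions of these helpers differ from their libc namesakes: strncpy_from_user() returns the string length (without the NUL) on success, the full count when no NUL was found, and -EFAULT on a fault, while strnlen_user() returns 0 on a fault and a value larger than the limit when the string does not fit. A typical caller therefore looks like this sketch (the function and buffer names are illustrative):

    /* Sketch of a getname()-style caller; needs <asm/uaccess.h> and <linux/errno.h>. */
    static int copy_path_from_user(char *kbuf, const char __user *upath, int buflen)
    {
            long len = strncpy_from_user(kbuf, upath, buflen);

            if (len < 0)
                    return len;              /* -EFAULT from the exception fixup     */
            if (len >= buflen)
                    return -ENAMETOOLONG;    /* no terminating NUL within buflen     */
            return 0;                        /* kbuf now holds a NUL-terminated copy */
    }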